diff --git a/.bazelrc b/.bazelrc index 1dd928acdb4..1b9f5e87c6b 100644 --- a/.bazelrc +++ b/.bazelrc @@ -18,8 +18,10 @@ # # Compiler options: # cuda_clang: Use clang when building CUDA code. -# c++17: Build with C++17 options -# c++1z: Build with C++17 options +# c++17: Build with C++17 options (links with libc++) +# c++1z: Build with C++17 options (links with libc++) +# c++17_gcc: Build with C++17 options (links with stdlibc++) +# c++1z_gcc: Build with C++17 options (links with stdlibc++) # avx_linux: Build with avx instruction set on linux. # avx2_linux: Build with avx2 instruction set on linux. # native_arch_linux: Build with instruction sets available to the host machine on linux @@ -28,6 +30,7 @@ # # Other build options: # short_logs: Only log errors during build, skip warnings. +# verbose_logs: Show all compiler warnings during build. # monolithic: Build all TF C++ code into a single shared object. # dynamic_kernels: Try to link all kernels dynamically (experimental). # libc++: Link against libc++ instead of stdlibc++ @@ -78,7 +81,16 @@ # elinux: General Embedded Linux options shared by all flavors. # elinux_aarch64: Embedded Linux options for aarch64 (ARM64) CPU support. # elinux_armhf: Embedded Linux options for armhf (ARMv7) CPU support. - +# +# Release build options (for all operating systems) +# release_common: Common options for all builds on all operating systems. +# release_windows_common: Common options for all builds on Windows. +# release_gpu_common: Common options for GPU builds on Linux and Windows. +# release_cpu_linux: Toolchain and CUDA options for Linux CPU builds. +# release_cpu_macos: Toolchain and CUDA options for MacOS CPU builds. +# release_gpu_linux: Toolchain and CUDA options for Linux GPU builds. +# release_cpu_windows: Toolchain and CUDA options for Windows CPU builds. +# release_gpu_windows: Toolchain and CUDA options for Windows GPU builds. # Allow builds using libc++ as a linker library # This is mostly for OSSFuzz, so we also pass in the flags from environment to clean build file @@ -155,14 +167,29 @@ build:mkl -c opt # config to build OneDNN backend with a user specified threadpool. build:mkl_threadpool --define=build_with_mkl=true --define=enable_mkl=true build:mkl_threadpool --define=tensorflow_mkldnn_contraction_kernel=0 +build:mkl_threadpool --define=build_with_mkl_dnn_v1_only=true +build:mkl_threadpool --define=build_with_mkl_opensource=true build:mkl_threadpool --define=build_with_mkldnn_threadpool=true build:mkl_threadpool -c opt + +# Config setting to build with oneDNN and without the binary blob +build:mkl_opensource_only --define=build_with_mkl=true --define=enable_mkl=true +build:mkl_opensource_only --define=tensorflow_mkldnn_contraction_kernel=0 +build:mkl_opensource_only --define=build_with_mkl_dnn_v1_only=true +build:mkl_opensource_only --define=build_with_mkl_opensource=true +build:mkl_opensource_only -c opt + # This config refers to building with CUDA available. It does not necessarily # mean that we build CUDA op kernels. build:using_cuda --define=using_cuda=true build:using_cuda --action_env TF_NEED_CUDA=1 build:using_cuda --crosstool_top=@local_config_cuda//crosstool:toolchain +# Enable the mlir generated GPU kernels only for cuda builds. +build --define=tensorflow_enable_mlir_generated_gpu_kernels=0 +# This is a more specific option, so it takes precedence over the line above for cuda builds. +build:using_cuda --define=tensorflow_enable_mlir_generated_gpu_kernels=1 + # This config refers to building CUDA op kernels with nvcc. 
build:cuda --config=using_cuda build:cuda --define=using_cuda_nvcc=true @@ -253,6 +280,8 @@ build:dynamic_kernels --copt=-DAUTOLOAD_DYNAMIC_KERNELS build:c++17 --cxxopt=-std=c++1z build:c++17 --cxxopt=-stdlib=libc++ build:c++1z --config=c++17 +build:c++17_gcc --cxxopt=-std=c++1z +build:c++1z_gcc --config=c++17_gcc # Enable using platform specific build settings, except when cross-compiling for # mobile platforms. @@ -322,6 +351,8 @@ build:windows --distinct_host_configuration=false # Suppress all warning messages. build:short_logs --output_filter=DONT_MATCH_ANYTHING +build:verbose_logs --output_filter= +build --config=short_logs # Instruction set optimizations # TODO(gunan): Create a feature in toolchains for avx/avx2 to @@ -341,7 +372,6 @@ build --config=v2 test --config=v2 # Enable XLA -build:xla --action_env=TF_ENABLE_XLA=1 build:xla --define=with_xla_support=true # BEGIN TF REMOTE BUILD EXECUTION OPTIONS @@ -534,3 +564,43 @@ try-import %workspace%/.tf_configure.bazelrc # Put user-specific options in .bazelrc.user try-import %workspace%/.bazelrc.user + +# Here are bazelrc configs for release builds +build:release_common --config=opt +build:release_common --config=v2 +build:release_common --distinct_host_configuration=false +build:release_common --action_env TF_CONFIGURE_IOS="0" + +build:release_cpu_linux --config=release_common +build:release_cpu_linux --config=avx_linux +# We use the same toolchain for CPU/GPU packages. +# Did not add this to the defaults in case this changes. +build:release_cpu_linux --crosstool_top=//third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.1:toolchain + +build:release_cpu_macos --config=release_common +build:release_cpu_macos --config=avx_linux + +build:release_gpu_common --config=release_common +build:release_gpu_common --config=cuda +build:release_gpu_common --config=tensorrt +build:release_gpu_common --action_env CUDA_TOOLKIT_PATH="/usr/local/cuda-10.1" +build:release_gpu_common --action_env=TF_CUDA_VERSION="10" +build:release_gpu_common --action_env=TF_CUDNN_VERSION="7" +build:release_gpu_common --action_env=TF_NEED_TENSORRT="1" +build:release_gpu_common --action_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_35,sm_37,sm_52,sm_60,sm_61,compute_70" +build:release_gpu_common --action_env=TENSORRT_INSTALL_PATH="/usr/local/tensorrt" +build:release_gpu_common --action_env=LD_LIBRARY_PATH="/usr/local/cuda:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/tensorrt/lib" +build:release_gpu_common --action_env=GCC_HOST_COMPILER_PATH="/usr/bin/gcc-5" + + +build:release_gpu_linux --config=release_gpu_common +build:release_gpu_linux --config=avx_linux +build:release_gpu_linux --crosstool_top=//third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.1:toolchain + +build:release_windows_common --config=release_common +build:release_windows_common --define=no_tensorflow_py_deps=true +build:release_windows_common --announce_rc + +build:release_cpu_windows --config=release_windows_common + +build:release_gpu_windows --config=release_windows_common diff --git a/README.md b/README.md index 9cf595bbf61..6398e8e27a1 100644 --- a/README.md +++ b/README.md @@ -123,20 +123,21 @@ Build Type | Status ### Community Supported Builds -Build Type | Status | Artifacts ------------------------------------------------------------------------------------ | 
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------- -**Linux AMD ROCm GPU** Nightly | [![Build Status](http://ml-ci.amd.com:21096/job/tensorflow-rocm-nightly/badge/icon)](http://ml-ci.amd.com:21096/job/tensorflow-rocm-nightly) | [Nightly](http://ml-ci.amd.com:21096/job/tensorflow-rocm-nightly/lastSuccessfulBuild/) -**Linux AMD ROCm GPU** Stable Release | [![Build Status](http://ml-ci.amd.com:21096/job/tensorflow-rocm-release/badge/icon)](http://ml-ci.amd.com:21096/job/tensorflow-rocm-release/) | Release [1.15](http://ml-ci.amd.com:21096/job/tensorflow-rocm-release/lastSuccessfulBuild/) / [2.x](http://ml-ci.amd.com:21096/job/tensorflow-rocm-v2-release/lastSuccessfulBuild/) -**Linux s390x** Nightly | [![Build Status](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/badge/icon)](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/) | [Nightly](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/) -**Linux s390x CPU** Stable Release | [![Build Status](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_Release_Build/badge/icon)](https://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_Release_Build/) | [Release](https://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_Release_Build/) -**Linux ppc64le CPU** Nightly | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Build/) | [Nightly](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Nightly_Artifact/) -**Linux ppc64le CPU** Stable Release | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/) | Release [1.15](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/) / [2.x](https://powerci.osuosl.org/job/TensorFlow2_PPC64LE_CPU_Release_Build/) -**Linux ppc64le GPU** Nightly | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Build/) | [Nightly](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/) -**Linux ppc64le GPU** Stable Release | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/) | Release [1.15](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/) / [2.x](https://powerci.osuosl.org/job/TensorFlow2_PPC64LE_GPU_Release_Build/) -**Linux aarch64 CPU** Nightly
Python 3.6 | [![Build Status](http://openlabtesting.org:15000/badge?project=tensorflow%2Ftensorflow)](https://status.openlabtesting.org/builds/builds?project=tensorflow%2Ftensorflow) | [Nightly](https://status.openlabtesting.org/builds/builds?project=tensorflow%2Ftensorflow&job_name=tensorflow-arm64-build-daily-master) -**Linux CPU with Intel oneAPI Deep Neural Network Library (oneDNN)** Nightly | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-whl-nightly/badge/icon)](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-whl-nightly/) | [Nightly](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-whl-nightly/) -**Linux CPU with Intel oneAPI Deep Neural Network Library (oneDNN)** Stable Release | ![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/badge/icon) | Release [1.15](https://pypi.org/project/intel-tensorflow/1.15.0/) / [2.x](https://pypi.org/project/intel-tensorflow/) -**Red Hat® Enterprise Linux® 7.6 CPU & GPU**
Python 2.7, 3.6 | [![Build Status](https://jenkins-tensorflow.apps.ci.centos.org/buildStatus/icon?job=tensorflow-rhel7-3.6&build=2)](https://jenkins-tensorflow.apps.ci.centos.org/job/tensorflow-rhel7-3.6/2/) | [1.13.1 PyPI](https://tensorflow.pypi.thoth-station.ninja/index/) +Build Type | Status | Artifacts +----------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------- +**Linux AMD ROCm GPU** Nightly | [![Build Status](http://ml-ci.amd.com:21096/job/tensorflow-rocm-nightly/badge/icon)](http://ml-ci.amd.com:21096/job/tensorflow-rocm-nightly) | [Nightly](http://ml-ci.amd.com:21096/job/tensorflow-rocm-nightly/lastSuccessfulBuild/) +**Linux AMD ROCm GPU** Stable Release | [![Build Status](http://ml-ci.amd.com:21096/job/tensorflow-rocm-release/badge/icon)](http://ml-ci.amd.com:21096/job/tensorflow-rocm-release/) | Release [1.15](http://ml-ci.amd.com:21096/job/tensorflow-rocm-release/lastSuccessfulBuild/) / [2.x](http://ml-ci.amd.com:21096/job/tensorflow-rocm-v2-release/lastSuccessfulBuild/) +**Linux s390x** Nightly | [![Build Status](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/badge/icon)](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/) | [Nightly](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/) +**Linux s390x CPU** Stable Release | [![Build Status](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_Release_Build/badge/icon)](https://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_Release_Build/) | [Release](https://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_Release_Build/) +**Linux ppc64le CPU** Nightly | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Build/) | [Nightly](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Nightly_Artifact/) +**Linux ppc64le CPU** Stable Release | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/) | Release [1.15](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/) / [2.x](https://powerci.osuosl.org/job/TensorFlow2_PPC64LE_CPU_Release_Build/) +**Linux ppc64le GPU** Nightly | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Build/) | [Nightly](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/) +**Linux ppc64le GPU** Stable Release | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/) | Release [1.15](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/) / [2.x](https://powerci.osuosl.org/job/TensorFlow2_PPC64LE_GPU_Release_Build/) +**Linux aarch64 CPU** Nightly
Python 3.6 | [![Build Status](http://openlabtesting.org:15000/badge?project=tensorflow%2Ftensorflow)](https://status.openlabtesting.org/builds/builds?project=tensorflow%2Ftensorflow&job_name=tensorflow-arm64-build-daily-master) | [Nightly](https://status.openlabtesting.org/builds/builds?project=tensorflow%2Ftensorflow&job_name=tensorflow-arm64-build-daily-master) +**Linux aarch64 CPU** Stable Release | [![Build Status](http://openlabtesting.org:15000/badge?project=tensorflow%2Ftensorflow&job_name=tensorflow-v1.15.3-cpu-arm64-release-build-show&job_name=tensorflow-v2.1.0-cpu-arm64-release-build-show)](http://status.openlabtesting.org/builds?project=tensorflow%2Ftensorflow&job_name=tensorflow-v2.1.0-cpu-arm64-release-build-show&job_name=tensorflow-v1.15.3-cpu-arm64-release-build-show) | Release [1.15](http://status.openlabtesting.org/builds?project=tensorflow%2Ftensorflow&job_name=tensorflow-v1.15.3-cpu-arm64-release-build-show) / [2.x](http://status.openlabtesting.org/builds?project=tensorflow%2Ftensorflow&job_name=tensorflow-v2.1.0-cpu-arm64-release-build-show) +**Linux CPU with Intel oneAPI Deep Neural Network Library (oneDNN)** Nightly | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-whl-nightly/badge/icon)](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-whl-nightly/) | [Nightly](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-whl-nightly/) +**Linux CPU with Intel oneAPI Deep Neural Network Library (oneDNN)** Stable Release | ![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/badge/icon) | Release [1.15](https://pypi.org/project/intel-tensorflow/1.15.0/) / [2.x](https://pypi.org/project/intel-tensorflow/) +**Red Hat® Enterprise Linux® 7.6 CPU & GPU**
Python 2.7, 3.6 | [![Build Status](https://jenkins-tensorflow.apps.ci.centos.org/buildStatus/icon?job=tensorflow-rhel7-3.6&build=2)](https://jenkins-tensorflow.apps.ci.centos.org/job/tensorflow-rhel7-3.6/2/) | [1.13.1 PyPI](https://tensorflow.pypi.thoth-station.ninja/index/) ## Resources diff --git a/RELEASE.md b/RELEASE.md index 69eca82c5f2..430e1b83885 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -11,10 +11,28 @@ * C-API functions `TF_StringDecode`, `TF_StringEncode`, and `TF_StringEncodedSize` are no longer relevant and have been removed; see core/platform/ctstring.h for string access/modification in C. -* In batching library, rename parameter - SharedBatchScheduler::QueueOptions::max_batch_size to a more accurate name - (input_batch_size_limit) for a recent feature to enable split of large batch - sizes. +* Removed `tf.distribute.Strategy.experimental_run_v2` method, which was deprecated in TF 2.2. +* `tensorflow.python`, `tensorflow.core` and `tensorflow.compiler` modules are + now hidden. These modules are not part of TensorFlow public API. +* A major refactoring of the internals of the Keras Functional API may affect code that is relying on certain internal details: + * Code that uses `isinstance(x, tf.Tensor)` instead of `tf.is_tensor` when checking Keras symbolic inputs/outputs should switch to using `tf.is_tensor`. + * Code that is overly dependent on the exact names attached to symbolic tensors (e.g. assumes there will be ":0" at the end of the inputs, treats names as unique identifiers instead of using `tensor.ref()`, etc.) + * Code that uses `get_concrete_function` to trace Keras symbolic inputs directly should switch to building matching `tf.TensorSpec`s directly and tracing the `TensorSpec` objects. + * Code that relies on the exact number and names of the op layers that TensorFlow operations were converted into. These may have changed. + * Code that uses `tf.map_fn`/`tf.cond`/`tf.while_loop`/control flow as op layers and happens to work before TF 2.4. These will explicitly be unsupported now. Converting these ops to Functional API op layers was unreliable before TF 2.4, and prone to erroring incomprehensibly or being silently buggy. + * Code that directly asserts on a Keras symbolic value in cases where ops like `tf.rank` used to return a static or symbolic value depending on if the input had a fully static shape or not. Now these ops always return symbolic values. + * Code already susceptible to leaking tensors outside of graphs becomes slightly more likely to do so now. + * Code that requires very tricky shape manipulation via converted op layers in order to work, where the Keras symbolic shape inference proves insufficient. + * Code that tries manually walking a `tf.keras.Model` layer by layer and assumes layers only ever have one positional argument. This assumption doesn't hold true before TF 2.4 either, but is more likely to cause issues know. + * Code that manually enters `keras.backend.get_graph()` before building a functional model. This is no longer needed. +* Start enforcing input shape assumptions when calling Functional API Keras + models. This may potentially break some users, in case there is a mismatch + between the shape used when creating `Input` objects in a Functional model, + and the shape of the data passed to that model. You can fix this mismatch by + either calling the model with correctly-shaped data, or by relaxing `Input` + shape assumptions (note that you can pass shapes with `None` entries for axes + that are meant to be dynamic). 
You can also disable the input checking + entirely by setting `model.input_spec = None`. ## Known Caveats @@ -24,6 +42,8 @@ * * +* A new module named `tf.experimental.numpy` is added, which is a NumPy-compatible API for writing TF programs. This module provides class `ndarray`, which mimics the `ndarray` class in NumPy, and wraps an immutable `tf.Tensor` under the hood. A subset of NumPy functions (e.g. `numpy.add`) are provided. Their inter-operation with TF facilities is seamless in most cases. See tensorflow/python/ops/numpy_ops/README.md for details of what is supported and how it differs from NumPy. +* A major refactoring of the internals of the Keras Functional API has been completed; it should improve the reliability, stability, and performance of constructing Functional models. ## Bug Fixes and Other Changes * * * TF Core: - * - * `tf.Tensor` is now a subclass of `typing.Generic`, allowing type annotations - to be parameterized by dtype: `tf.Tensor[tf.Int32]`. This requires Python 3, - and will become fully compatible with static type checkers in the future. - + * `tf.types.experimental.TensorLike` is a new `Union` type that can be used as + type annotation for variables representing a Tensor or a value that can be + converted to Tensor by `tf.convert_to_tensor`. + * Calling ops with Python constants or NumPy values is now consistent with + tf.convert_to_tensor behavior. This avoids operations like tf.reshape + truncating inputs such as from int64 to int32. + * Added `tf.sparse.map_values` to apply a function to the `.values` of `SparseTensor` arguments. + * The Python bitwise operators for `Tensor` (`__and__`, `__or__`, `__xor__` + and `__invert__`) now support non-`bool` arguments and apply the + corresponding bitwise ops. `bool` arguments continue to be supported and + dispatch to logical ops. This brings them more in line with Python and NumPy + behavior. + * Added `tf.SparseTensor.with_values`. This returns a new SparseTensor with + the same sparsity pattern, but with new provided values. It is similar to + the `with_values` function of `RaggedTensor`. + * Added `StatelessCase` op, and uses it if none of the case branches has stateful ops. * `tf.data`: + * Added new `tf.data.experimental.service.register_dataset` and + `tf.data.experimental.service.from_dataset_id` APIs to enable one process + to register a dataset with the tf.data service, and another process to + consume data from the dataset. + * Added support for tf.data service dispatcher fault tolerance. To enable + fault tolerance, configure a `work_dir` when running your dispatcher + server and set `dispatcher_fault_tolerance=True`. The dispatcher will + store its state to `work_dir`, so that on restart it can continue from its + previous state. * Added optional `exclude_cols` parameter to CsvDataset. This parameter is - the complement of `select_cols`; at most one of these should be specified. + the complement of `select_cols`; at most one of these should be specified. + * We have implemented an optimization which reorders data-discarding + transformations such as `take` and `shard` to happen earlier in the + dataset when it is safe to do so. The optimization can be disabled via + the `experimental_optimization.reorder_data_discarding_ops` dataset + option. +* `tf.image`: + * Added deterministic `tf.image.stateless_random_*` functions for each + `tf.image.random_*` function.
Added a new op + `stateless_sample_distorted_bounding_box` which is a deterministic + version of the `sample_distorted_bounding_box` op. Given the same seed, these + stateless functions/ops produce the same results independent of how many + times the function is called, and independent of global seed settings. * `tf.distribute`: * -* `tf.keras`: - * -* `tf.function`/AutoGraph: - * +* `tf.keras`: + * Improvements from the functional API refactoring: + * Functional model construction does not need to maintain a global workspace graph, removing memory leaks especially when building many models or very large models. + * Functional model construction should be ~8-10% faster on average. + * Functional models can now contain non-symbolic values in their call inputs inside of the first positional argument. + * Several classes of TF ops that were not reliably converted to Keras layers during functional API construction should now work, e.g. `tf.image.ssim_multiscale`. + * Error messages when Functional API construction goes wrong (and when ops cannot be converted to Keras layers automatically) should be clearer and easier to understand. + * `Optimizer.minimize` can now accept a loss `Tensor` and a `GradientTape` + as an alternative to accepting a `callable` loss. + * Added the `beta` parameter to the FTRL optimizer to match the paper. + * Added `mobilenet_v3` to the Keras application models. +* `tf.function` / AutoGraph: + * Added `experimental_follow_type_hints` argument for `tf.function`. When + True, the function may use type annotations to optimize its tracing + performance. + * Added support for `iter(DistributedDataset)` in AutoGraph `for` loops. + * AutoGraph now allows creating new symbols inside a TensorFlow loop, if + the values of these symbols at an iteration do not depend on the previous + iteration. These types of loops must run at least one iteration, and will + raise a runtime error otherwise. + + Example: + + ``` + for batch in data: + outputs = train_step(batch) + tf.print('final outputs', outputs) + ``` + See tensorflow/python/autograph/g3doc/reference/limitations.md for more + info. * `tf.lite`: + * `DynamicBuffer::AddJoinedString()` will now add a separator if the first + string to be joined is empty. + * `TFLiteConverter`: + * Support optional flags `inference_input_type` and `inference_output_type` for full integer quantized models. This allows users to modify the model input and output type to integer types (`tf.int8`, `tf.uint8`) instead of defaulting to float type (`tf.float32`). + * Deprecate `Interpreter::UseNNAPI(bool)` C++ API. + * Prefer using `NnApiDelegate()` and related delegate configuration methods directly. + * Add NNAPI Delegation support for requantization use cases by converting the operation into a dequantize-quantize pair. * * `tf.random`: * * Math and Linear Algebra: * * TPU Enhancements: + * Added support for the `beta` parameter of the FTRL optimizer for TPU + embeddings. Users of other TensorFlow platforms can implement equivalent + behavior by adjusting the `l2` parameter. * * XLA Support: + * `xla.experimental.compile` is deprecated; use + `tf.function(experimental_compile=True)` instead. + * * Tracing and Debugging: * * Other: - * We have replaced uses of "whitelist" with "allowlist" where possible. - Please see https://developers.google.com/style/word-list#blacklist for more - context. + * We have replaced uses of "whitelist" and "blacklist" with "allowlist" + and "denylist" where possible. Please see + https://developers.google.com/style/word-list#blacklist for more context.
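For illustration, a minimal sketch of the determinism contract described in the `tf.image` item above, assuming the TF 2.4 nightly API (the exact set of exported `stateless_random_*` ops is whatever the nightly provides):

```
import tensorflow as tf

image = tf.zeros([4, 64, 64, 3])   # dummy batch of images
seed = (1, 2)                      # stateless ops take an explicit [2]-element seed

# Same seed -> same result, regardless of how many times the op is called
# and regardless of any global seed settings.
a = tf.image.stateless_random_flip_left_right(image, seed=seed)
b = tf.image.stateless_random_flip_left_right(image, seed=seed)
assert tf.reduce_all(a == b)
```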
* ## Thanks to our Contributors @@ -71,19 +161,206 @@ stjohnso98, , , , , # Release 2.3.0 -## Breaking Changes +## Major Features and Improvements + * `tf.data` adds two new mechanisms to solve input pipeline bottlenecks and save resources: + * [snapshot](https://www.tensorflow.org/api_docs/python/tf/data/experimental/snapshot) + * [tf.data service](https://www.tensorflow.org/api_docs/python/tf/data/experimental/service). -* `tf.image.extract_glimpse` has been updated to correctly process the case - where `centered=False` and `normalized=False`. This is a breaking change as - the output is different from (incorrect) previous versions. Note this - breaking change only impacts `tf.image.extract_glimpse` and - `tf.compat.v2.image.extract_glimpse` API endpoints. The behavior of - `tf.compat.v1.image.extract_glimpse` does not change. The behavior of - exsiting C++ kernel `ExtractGlimpse` does not change as well, so saved - models will not be impacted. + In addition checkout the detailed [guide](https://www.tensorflow.org/guide/data_performance_analysis) for analyzing input pipeline performance with TF Profiler. + + * [`tf.distribute.TPUStrategy`](https://www.tensorflow.org/api_docs/python/tf/distribute/TPUStrategy) is now a stable API and no longer considered experimental for TensorFlow. (earlier `tf.distribute.experimental.TPUStrategy`). + + * [TF Profiler](https://www.tensorflow.org/guide/profiler) introduces two new tools: a memory profiler to visualize your model’s memory usage over time and a [python tracer](https://www.tensorflow.org/guide/profiler#events) which allows you to trace python function calls in your model. Usability improvements include better diagnostic messages and [profile options](https://tensorflow.org/guide/profiler#collect_performance_data) to customize the host and device trace verbosity level. + + * Introduces experimental support for Keras Preprocessing Layers API ([`tf.keras.layers.experimental.preprocessing.*`](https://www.tensorflow.org/api_docs/python/tf/keras/layers/experimental/preprocessing?version=nightly)) to handle data preprocessing operations, with support for composite tensor inputs. Please see below for additional details on these layers. + + * TFLite now properly supports dynamic shapes during conversion and inference. We’ve also added opt-in support on Android and iOS for [XNNPACK](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/delegates/xnnpack), a highly optimized set of CPU kernels, as well as opt-in support for [executing quantized models on the GPU](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/g3doc/performance/gpu_advanced.md#running-quantized-models-experimental). + + * Libtensorflow packages are available in GCS starting this release. We have also started to [release a nightly version of these packages](https://github.com/tensorflow/tensorflow#official-builds). + + * The experimental Python API [`tf.debugging.experimental.enable_dump_debug_info()`](https://www.tensorflow.org/api_docs/python/tf/debugging/experimental/enable_dump_debug_info) now allows you to instrument a TensorFlow program and dump debugging information to a directory on the file system. 
The directory can be read and visualized by a new interactive dashboard in TensorBoard 2.3 called [Debugger V2](https://www.tensorflow.org/tensorboard/debugger_v2), which reveals the details of the TensorFlow program including graph structures, history of op executions at the Python (eager) and intra-graph levels, the runtime dtype, shape, and numerical composition of tensors, as well as their code locations. + +## Breaking Changes +* Increases the **minimum bazel version** required to build TF to **3.1.0**. +* `tf.data` + * Makes the following (breaking) changes to `tf.data`: + * C++ API: - `IteratorBase::RestoreInternal`, `IteratorBase::SaveInternal`, and `DatasetBase::CheckExternalState` become pure-virtual and subclasses are now expected to provide an implementation. + * The deprecated `DatasetBase::IsStateful` method is removed in favor of `DatasetBase::CheckExternalState`. + * Deprecated overrides of `DatasetBase::MakeIterator` and `MakeIteratorFromInputElement` are removed. + * The signature of `tensorflow::data::IteratorBase::SaveInternal` and `tensorflow::data::IteratorBase::SaveInput` has been extended with a `SerializationContext` argument to enable overriding the default policy for handling external state during iterator checkpointing. This is not a backwards compatible change and all subclasses of `IteratorBase` *need to be updated* accordingly. +* `tf.keras` + * Add a new `BackupAndRestore` callback for handling distributed training failures & restarts. Please take a look at this [tutorial](https://www.tensorflow.org/tutorials/distribute/multi_worker_with_keras) for details on how to use the callback. +* `tf.image.extract_glimpse` has been updated to correctly process the case + where `centered=False` and `normalized=False`. This is a breaking change as + the output is different from (incorrect) previous versions. Note this + breaking change only impacts `tf.image.extract_glimpse` and + `tf.compat.v2.image.extract_glimpse` API endpoints. The behavior of + `tf.compat.v1.image.extract_glimpse` does not change. The behavior of + the existing C++ kernel `ExtractGlimpse` does not change either, so saved + models using `tf.raw_ops.ExtractGlimpse` will not be impacted. + +## Known Caveats + * `tf.lite` + * Keras-based LSTM models must be converted with an explicit batch size in the input layer. ## Bug Fixes and Other Changes -* Mutable tables now restore checkpointed values when loaded from SavedModel. + +### TF Core: + * Set `tf2_behavior` to 1 to enable V2 for early loading cases. + * Add `execute_fn_for_device` function to dynamically choose the implementation based on underlying device placement. + * Eager: + * Add `reduce_logsumexp` benchmark with experimental compile. + * Give `EagerTensor`s a meaningful `__array__` implementation. + * Add another version of defun matmul for performance analysis. + * `tf.function`/AutoGraph: + * `AutoGraph` now includes into TensorFlow loops any variables that are closed over by local functions. Previously, such variables were sometimes incorrectly ignored. + * Functions returned by the `get_concrete_function` method of `tf.function` objects can now be called with arguments consistent with the original arguments or type specs passed to `get_concrete_function`. This calling convention is now the preferred way to use concrete functions with nested values and composite tensors. Please check the [guide](https://www.tensorflow.org/guide/concrete_function) for more details on `concrete_function`.
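As a small illustration of that calling convention (a sketch assuming TF 2.3; the structured call is the point, not the particular function):

```
import tensorflow as tf

@tf.function
def combine(d):
    return d["a"] * d["b"]

# Trace once for a structured (nested) input signature...
cf = combine.get_concrete_function(
    {"a": tf.TensorSpec([None], tf.float32), "b": tf.TensorSpec([], tf.float32)})

# ...then call the concrete function with the same nested structure,
# rather than with manually flattened tensors.
out = cf({"a": tf.constant([1.0, 2.0]), "b": tf.constant(3.0)})
```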
+ * Update `tf.function`'s `experimental_relax_shapes` to handle composite tensors appropriately. + * Optimize `tf.function` invocation, by removing redundant list converter. + * `tf.function` will retrace when called with a different variable instead of simply using the `dtype` & `shape`. + * [Improve support](https://github.com/tensorflow/tensorflow/issues/33862) for dynamically-sized TensorArray inside `tf.function`. + * `tf.math`: + * Narrow down `argmin`/`argmax` contract to always return the smallest index for ties. + * `tf.math.reduce_variance` and `tf.math.reduce_std` return correct computation for complex types and no longer support integer types. + * Add Bessel functions of order 0,1 to `tf.math.special`. + * `tf.divide` now always returns a tensor to be consistent with documentation and other APIs. + * `tf.image`: + * Replaced [`tf.image.non_max_suppression_padded`](https://www.tensorflow.org/versions/r2.3/api_docs/python/tf/image/non_max_suppression_padded?hl=en) with a new implementation that supports batched inputs, which is considerably faster on TPUs and GPUs. Boxes with area=0 will be ignored. Existing usage with single inputs should still work as before. + * `tf.linalg` + * Add `tf.linalg.banded_triangular_solve`. + * `tf.random`: + * Add `tf.random.stateless_parameterized_truncated_normal`. + * `tf.ragged`: + * Add `tf.ragged.cross` and `tf.ragged.cross_hashed` operations. + * `tf.RaggedTensor`: + * `RaggedTensor.to_tensor()` now preserves static shape. + * Add `tf.strings.format()` and `tf.print()` to support RaggedTensors. + * `tf.saved_model`: + * `@tf.function` from SavedModel no longer ignores args after a `RaggedTensor` when selecting the concrete function to run. + * Fix save model issue for ops with a list of functions. + * Add `tf.saved_model.LoadOptions` with [`experimental_io_device`](https://www.tensorflow.org/versions/r2.3/api_docs/python/tf/saved_model/LoadOptions?hl=en) as arg with default value `None` to choose the I/O device for loading models and weights. + * Update `tf.saved_model.SaveOptions` with [`experimental_io_device`](https://www.tensorflow.org/versions/r2.3/api_docs/python/tf/saved_model/SaveOptions?hl=en) as arg with default value `None` to choose the I/O device for saving models and weights. + * Mutable tables now restore checkpointed values when loaded from SavedModel. + * GPU + * TF 2.3 includes PTX kernels only for [compute capability](https://developer.nvidia.com/cuda-gpus) 7.0 to reduce the TF pip binary size. Earlier releases included PTX for a variety of older compute capabilities. + * Others + * Retain parent namescope for ops added inside `tf.while_loop`/`tf.cond`/`tf.switch_case`. + * Update `tf.vectorized_map` to support vectorizing `tf.while_loop` and TensorList operations. + * `tf.custom_gradient` can now be applied to functions that accept nested structures of `tensors` as inputs (instead of just a list of tensors). Note that Python structures such as tuples and lists now won't be treated as tensors, so if you still want them to be treated that way, you need to wrap them with `tf.convert_to_tensor`. + * No lowering on gradient case op when input is `DeviceIndex` op. + * Extend the ragged version of `tf.gather` to support `batch_dims` and `axis` args. + * Update `tf.map_fn` to support RaggedTensors and SparseTensors. + * Deprecate `tf.group`. It is not useful in eager mode. 
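A brief sketch of how the `experimental_io_device` save/load options listed above are intended to be used; the path, model, and device string are illustrative placeholders, not part of this change:

```
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])

# Route SavedModel file I/O through a specific device, e.g. the local host
# when saving from a remote worker.
save_options = tf.saved_model.SaveOptions(experimental_io_device="/job:localhost")
tf.saved_model.save(model, "/tmp/my_model", options=save_options)

load_options = tf.saved_model.LoadOptions(experimental_io_device="/job:localhost")
restored = tf.saved_model.load("/tmp/my_model", options=load_options)
```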
+ * Add CPU and GPU implementation of modified variation of [`FTRL`](https://www.tensorflow.org/versions/r2.3/api_docs/python/tf/raw_ops/ApplyFtrl)/[`FTRLV2`](https://www.tensorflow.org/versions/r2.3/api_docs/python/tf/raw_ops/ApplyFtrlV2) that can triggerred by `multiply_linear_by_lr` allowing a learning rate of zero. + +### `tf.data`: + * `tf.data.experimental.dense_to_ragged_batch` works correctly with tuples. + * `tf.data.experimental.dense_to_ragged_batch` to output variable ragged rank. + * `tf.data.experimental.cardinality` is now a method on `tf.data.Dataset`. + * `tf.data.Dataset` now supports `len(Dataset)` when the cardinality is finite. + +### `tf.distribute`: + * Expose experimental [`tf.distribute.DistributedDataset`](https://www.tensorflow.org/versions/r2.3/api_docs/python/tf/distribute/DistributedDataset?hl=en) and [`tf.distribute.DistributedIterator`](https://www.tensorflow.org/versions/r2.3/api_docs/python/tf/distribute/DistributedIterator) to distribute input data when using `tf.distribute` to scale training on multiple devices. + * Added a [`get_next_as_optional`](https://www.tensorflow.org/versions/r2.3/api_docs/python/tf/distribute/DistributedIterator?hl=en#get_next_as_optional) method for [`tf.distribute.DistributedIterator`](https://www.tensorflow.org/versions/r2.3/api_docs/python/tf/distribute/DistributedIterator?hl=en) class to return a `tf.experimental.Optional` instance that contains the next value for all replicas or none instead of raising an out of range error. Also see *new* [guide on input distribution](https://www.tensorflow.org/tutorials/distribute/input). + * Allow var.assign on MirroredVariables with aggregation=NONE in replica context. Previously this would raise an error. We now allow this because many users and library writers find using `.assign` in replica context to be more convenient, instead of having to use `Strategy.extended.update` which was the previous way of updating variables in this situation. + * `tf.distribute.experimental.MultiWorkerMirroredStrategy` adds support for partial batches. Workers running out of data now continue to participate in the training with empty inputs, instead of raising an error. Learn more about [partial batches here](https://www.tensorflow.org/tutorials/distribute/input#partial_batches). + * Improve the performance of reading metrics eagerly under `tf.distribute.experimental.MultiWorkerMirroredStrategy`. + * Fix the issue that `strategy.reduce()` inside `tf.function` may raise exceptions when the values to reduce are from loops or if-clauses. + * Fix the issue that `tf.distribute.MirroredStrategy` cannot be used together with `tf.distribute.experimental.MultiWorkerMirroredStrategy`. + * Add a `tf.distribute.cluster_resolver.TPUClusterResolver.connect` API to simplify TPU initialization. + +### `tf.keras`: + * Introduces experimental preprocessing layers API (`tf.keras.layers.experimental.preprocessing`) to handle data preprocessing operations such as categorical feature encoding, text vectorization, data normalization, and data discretization (binning). The newly added layers provide a replacement for the legacy feature column API, and support composite tensor inputs. 
+ * Added **categorical data** processing layers: + * `IntegerLookup` & `StringLookup`: build an index of categorical feature values + * `CategoryEncoding`: turn integer-encoded categories into one-hot, multi-hot, or tf-idf encoded representations + * `CategoryCrossing`: create new categorical features representing co-occurrences of previous categorical feature values + * `Hashing`: the hashing trick, for large-vocabulary categorical features + * `Discretization`: turn continuous numerical features into categorical features by binning their values + * Improved **image preprocessing** layers: `CenterCrop`, `Rescaling` + * Improved **image augmentation** layers: `RandomCrop`, `RandomFlip`, `RandomTranslation`, `RandomRotation`, `RandomHeight`, `RandomWidth`, `RandomZoom`, `RandomContrast` + * Improved **`TextVectorization`** layer, which handles string tokenization, n-gram generation, and token encoding + * The `TextVectorization` layer now accounts for the mask_token as part of the vocabulary size when output_mode='int'. This means that, if you have a max_tokens value of 5000, your output will have 5000 unique values (not 5001 as before). + * Change the return value of `TextVectorization.get_vocabulary()` from `byte` to `string`. Users who previously were calling 'decode' on the output of this method should no longer need to do so. + * Introduce new Keras dataset generation utilities : + * **[`image_dataset_from_directory`](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/image_dataset_from_directory)** is a utility based on `tf.data.Dataset`, meant to replace the legacy `ImageDataGenerator`. It takes you from a structured directory of images to a labeled dataset, in one function call. Note that it doesn't perform image data augmentation (which is meant to be done using preprocessing layers). + * **[`text_dataset_from_directory`](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text_dataset_from_directory)** takes you from a structured directory of text files to a labeled dataset, in one function call. + * **[`timeseries_dataset_from_array`](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/timeseries_dataset_from_array)** is a `tf.data.Dataset`-based replacement of the legacy `TimeseriesGenerator`. It takes you from an array of timeseries data to a dataset of shifting windows with their targets. + * Added [`experimental_steps_per_execution`](https://www.tensorflow.org/versions/r2.3/api_docs/python/tf/keras/Model?hl=en#compile) + arg to `model.compile` to indicate the number of batches to run per `tf.function` call. This can speed up Keras Models on TPUs up to 3x. + * Extends `tf.keras.layers.Lambda` layers to support multi-argument lambdas, and keyword arguments when calling the layer. + * Functional models now get constructed if *any* tensor in a layer call's arguments/keyword arguments comes from a keras input. Previously the functional api would only work if all of the elements in the first argument to the layer came from a keras input. + * Clean up `BatchNormalization` layer's `trainable` property to act like standard python state when it's used inside `tf.functions` (frozen at tracing time), instead of acting like a pseudo-variable whose updates *kind of sometimes* get reflected in already-traced `tf.function` traces. + * Add the `Conv1DTranspose` layer. + * Refine the semantics of `SensitivitySpecificityBase` derived metrics. 
See the updated API docstrings for [`tf.keras.metrics.SensitivityAtSpecificity`](https://www.tensorflow.org/versions/r2.3/api_docs/python/tf/keras/metrics/SensitivityAtSpecificity) and [`tf.keras.metrics.SpecificityAtSensitivty`](https://www.tensorflow.org/versions/r2.3/api_docs/python/tf/keras/metrics/SpecificityAtSensitivity). + +### `tf.lite`: + * Converter + * Restored `inference_input_type` and `inference_output_type` flags in TF 2.x TFLiteConverter (backward compatible with TF 1.x) to support integer (tf.int8, tf.uint8) input and output types in post training full integer quantized models. + * Added support for converting and resizing models with dynamic (placeholder) dimensions. Previously, there was only limited support for dynamic batch size, and even that did not guarantee that the model could be properly resized at runtime. + * Enabled experimental support for a new quantization mode with 16-bit activations and 8-bit weights. See `lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8`. + * CPU + * Fix an issue w/ dynamic weights and `Conv2D` on x86. + * Add a runtime Android flag for enabling `XNNPACK` for optimized CPU performance. + * Add a runtime iOS flag for enabling `XNNPACK` for optimized CPU performance. + * Add a compiler flag to enable building a TFLite library that applies `XNNPACK` delegate automatically when the model has a `fp32` operation. + * GPU + * Allow GPU acceleration starting with internal graph nodes + * Experimental support for quantized models with the Android GPU delegate + * Add GPU delegate whitelist. + * Rename GPU whitelist -> compatibility (list). + * Improve GPU compatibility list entries from crash reports. + * NNAPI + * Set default value for `StatefulNnApiDelegate::Options::max_number_delegated_partitions` to 3. + * Add capability to disable `NNAPI` CPU and check `NNAPI` Errno. + * Fix crashes when using `NNAPI` with target accelerator specified with model containing Conv2d or FullyConnected or LSTM nodes with quantized weights. + * Fix `ANEURALNETWORKS_BAD_DATA` execution failures with `sum`/`max`/`min`/`reduce` operations with `scalar` inputs. + * Hexagon + * TFLite Hexagon Delegate out of experimental. + * Experimental `int8` support for most hexagon ops. + * Experimental per-channel quant support for `conv` in Hexagon delegate. + * Support dynamic batch size in C++ API. + * CoreML + * Opensource CoreML delegate + * Misc + * Enable building Android TFLite targets on Windows + * Add support for `BatchMatMul`. + * Add support for `half_pixel_centers` with `ResizeNearestNeighbor`. + * Add 3D support for `BatchToSpaceND`. + * Add 5D support for `BroadcastSub`, `Maximum`, `Minimum`, `Transpose` and `BroadcastDiv`. + * Rename `kTfLiteActRelu1` to `kTfLiteActReluN1To1`. + * Enable flex delegate on tensorflow.lite.Interpreter Python package. + * Add `Buckettize`, `SparseCross` and `BoostedTreesBucketize` to the flex whitelist. + * Add support for selective registration of flex ops. + * Add missing kernels for flex delegate whitelisted ops. + * Fix issue when using direct `ByteBuffer` inputs with graphs that have dynamic shapes. + * Fix error checking supported operations in a model containing `HardSwish`. + +### Packaging Support + * Added `tf.sysconfig.get_build_info()`. Returns a dict that describes the build environment of the currently installed TensorFlow package, e.g. the NVIDIA CUDA and NVIDIA CuDNN versions used when TensorFlow was built. + +### Profiler + * Fix a subtle use-after-free issue in `XStatVisitor::RefValue()`. 
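For example, the new packaging helper can be queried as below; the key names (such as `"cuda_version"`) are indicative and may differ across platforms and versions:

```
import tensorflow as tf

info = tf.sysconfig.get_build_info()
# Dict describing how the installed package was built, e.g. CUDA/cuDNN versions.
print(info.get("cuda_version"), info.get("cudnn_version"))
```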
+ +### TPU Enhancements + * Adds 3D mesh support in TPU configurations ops. + * Added TPU code for `FTRL` with `multiply_linear_by_lr`. + * Silently adds a new file system registry at `gstpu`. + * Support `restartType` in cloud tpu client. + * Depend on a specific version of google-api-python-client. + * Fixes apiclient import. + +### Tracing and Debugging + * Add a `TFE_Py_Execute` traceme. + +### XLA Support + * Implement stable `argmin` and `argmax` + +## Thanks to our Contributors + +This release contains contributions from many people at Google, as well as: + +902449@58880@bigcat_chen@ASIC, Abdul Baseer Khan, Abhineet Choudhary, Abolfazl Shahbazi, Adam Hillier, ag.ramesh, Agoniii, Ajay P, Alex Hoffman, Alexander Bayandin, Alexander Grund, Alexandre Abadie, Alexey Rogachevskiy, amoitra, Andrew Stevens, Angus-Luo, Anshuman Tripathy, Anush Elangovan, Artem Mavrin, Ashutosh Hathidara, autoih, Ayushman Kumar, ayushmankumar7, Bairen Yi, Bas Aarts, Bastian Eichenberger, Ben Barsdell, bhack, Bharat Raghunathan, Biagio Montaruli, Bigcat-Himax, blueyi, Bryan Cutler, Byambaa, Carlos Hernandez-Vaquero, Chen Lei, Chris Knorowski, Christian Clauss, chuanqiw, CuiYifeng, Daniel Situnayake, Daria Zhuravleva, Dayananda-V, Deven Desai, Devi Sandeep Endluri, Dmitry Zakharov, Dominic Jack, Duncan Riach, Edgar Liberis, Ehsan Toosi, ekuznetsov139, Elena Zhelezina, Eugene Kuznetsov, Eugene Mikhantiev, Evgenii Zheltonozhskii, Fabio Di Domenico, Fausto Morales, Fei Sun, feihugis, Felix E. Klee, flyingcat, Frederic Bastien, Fredrik Knutsson, frreiss, fsx950223, ganler, Gaurav Singh, Georgios Pinitas, Gian Marco Iodice, Giorgio Arena, Giuseppe Rossini, Gregory Keith, Guozhong Zhuang, gurushantj, Hahn Anselm, Harald Husum, Harjyot Bagga, Hristo Vrigazov, Ilya Persky, Ir1d, Itamar Turner-Trauring, jacco, Jake Tae, Janosh Riebesell, Jason Zaman, jayanth, Jeff Daily, Jens Elofsson, Jinzhe Zeng, JLZ, Jonas Skog, Jonathan Dekhtiar, Josh Meyer, Joshua Chia, Judd, justkw, Kaixi Hou, Kam D Kasravi, Kamil Rakoczy, Karol Gugala, Kayou, Kazuaki Ishizaki, Keith Smiley, Khaled Besrour, Kilaru Yasaswi Sri Chandra Gandhi, Kim, Young Soo, Kristian Hartikainen, Kwabena W. Agyeman, Leslie-Fang, Leslie-Fang-Intel, Li, Guizi, Lukas Geiger, Lutz Roeder, M\U00E5Ns Nilsson, Mahmoud Abuzaina, Manish, Marcel Koester, Marcin Sielski, marload, Martin Jul, Matt Conley, mdfaijul, Meng, Peng, Meteorix, Michael Käufl, Michael137, Milan Straka, Mitchell Vitez, Ml-0, Mokke Meguru, Mshr-H, nammbash, Nathan Luehr, naumkin, Neeraj Bhadani, ngc92, Nick Morgan, nihui, Niranjan Hasabnis, Niranjan Yadla, Nishidha Panpaliya, Oceania2018, oclyke, Ouyang Jin, OverLordGoldDragon, Owen Lyke, Patrick Hemmer, Paul Andrey, Peng Sun, periannath, Phil Pearl, Prashant Dandriyal, Prashant Kumar, Rahul Huilgol, Rajan Singh, Rajeshwar Reddy T, rangjiaheng, Rishit Dagli, Rohan Reddy, rpalakkal, rposts, Ruan Kunliang, Rushabh Vasani, Ryohei Ikegami, Semun Lee, Seo-Inyoung, Sergey Mironov, Sharada Shiddibhavi, ShengYang1, Shraiysh Vaishay, Shunya Ueta, shwetaoj, Siyavash Najafzade, Srinivasan Narayanamoorthy, Stephan Uphoff, storypku, sunchenggen, sunway513, Sven-Hendrik Haase, Swapnil Parekh, Tamas Bela Feher, Teng Lu, tigertang, tomas, Tomohiro Ubukata, tongxuan.ltx, Tony Tonev, Tzu-Wei Huang, Téo Bouvard, Uday Bondhugula, Vaibhav Jade, Vijay Tadikamalla, Vikram Dattu, Vincent Abriou, Vishnuvardhan Janapati, Vo Van Nghia, VoVAllen, Will Battel, William D. 
Irons, wyzhao, Xiaoming (Jason) Cui, Xiaoquan Kong, Xinan Jiang, xutianming, Yair Ehrenwald, Yasir Modak, Yasuhiro Matsumoto, Yixing Fu, Yong Tang, Yuan Tang, zhaozheng09, Zilin Zhu, zilinzhu, 张志豪 # Release 2.1.1 @@ -210,7 +487,7 @@ Coinciding with this change, new releases of [TensorFlow's Docker images](https: `Strategy.extended.update` and `Strategy.extended.update_non_slot`. * Experimental support for shape invariants has been enabled in `tf.function`. See the API docs for - `tf.autograph.experimental.set_loop_options` for additonal info. + `tf.autograph.experimental.set_loop_options` for additional info. * AutoGraph error messages now exclude frames corresponding to APIs internal to AutoGraph. * Improve shape inference for `tf.function` input arguments to unlock more @@ -293,7 +570,7 @@ Coinciding with this change, new releases of [TensorFlow's Docker images](https: also deterministic back-prop of bias-addition in Keras layers) to include when XLA JIT compilation is enabled. * Fix problem, when running on a CUDA GPU and when either environment - variable `TF_DETERMINSTIC_OPS` or environment variable + variable `TF_DETERMINISTIC_OPS` or environment variable `TF_CUDNN_DETERMINISTIC` is set to "true" or "1", in which some layer configurations led to an exception with the message "No algorithm worked!" @@ -336,32 +613,86 @@ This release contains contributions from many people at Google, as well as: TensorFlow 2.1 will be the last TF release supporting Python 2. Python 2 support [officially ends an January 1, 2020](https://www.python.org/dev/peps/pep-0373/#update). [As announced earlier](https://groups.google.com/a/tensorflow.org/d/msg/announce/gVwS5RC8mds/dCt1ka2XAAAJ), TensorFlow will also stop supporting Python 2 starting January 1, 2020, and no more releases are expected in 2019. ## Major Features and Improvements -* The `tensorflow` pip package now includes GPU support by default (same as `tensorflow-gpu`) for both Linux and Windows. This runs on machines with and without NVIDIA GPUs. `tensorflow-gpu` is still available, and CPU-only packages can be downloaded at `tensorflow-cpu` for users who are concerned about package size. -* **Windows users:** Officially-released `tensorflow` Pip packages are now built with Visual Studio 2019 version 16.4 in order to take advantage of the new `/d2ReducedOptimizeHugeFunctions` compiler flag. To use these new packages, you must install "Microsoft Visual C++ Redistributable for Visual Studio 2015, 2017 and 2019", available from Microsoft's website [here](https://support.microsoft.com/help/2977003/the-latest-supported-visual-c-downloads). - * This does not change the minimum required version for building TensorFlow from source on Windows, but builds enabling `EIGEN_STRONG_INLINE` can take over 48 hours to compile without this flag. Refer to `configure.py` for more information about `EIGEN_STRONG_INLINE` and `/d2ReducedOptimizeHugeFunctions`. - * If either of the required DLLs, `msvcp140.dll` (old) or `msvcp140_1.dll` (new), are missing on your machine, `import tensorflow` will print a warning message. -* The `tensorflow` pip package is built with CUDA 10.1 and cuDNN 7.6. -* `tf.keras` - * Experimental support for mixed precision is available on GPUs and Cloud TPUs. See [usage guide](https://www.tensorflow.org/guide/keras/mixed_precision). - * Introduced the `TextVectorization` layer, which takes as input raw strings and takes care of text standardization, tokenization, n-gram generation, and vocabulary indexing. 
See this [end-to-end text classification example](https://colab.research.google.com/drive/1RvCnR7h0_l4Ekn5vINWToI9TNJdpUZB3). - * Keras `.compile` `.fit` `.evaluate` and `.predict` are allowed to be outside of the DistributionStrategy scope, as long as the model was constructed inside of a scope. - * Experimental support for Keras `.compile`, `.fit`, `.evaluate`, and `.predict` is available for Cloud TPUs, Cloud TPU, for all types of Keras models (sequential, functional and subclassing models). - * Automatic outside compilation is now enabled for Cloud TPUs. This allows `tf.summary` to be used more conveniently with Cloud TPUs. - * Dynamic batch sizes with DistributionStrategy and Keras are supported on Cloud TPUs. - * Support for `.fit`, `.evaluate`, `.predict` on TPU using numpy data, in addition to `tf.data.Dataset`. - * Keras reference implementations for many popular models are available in the TensorFlow [Model Garden](https://github.com/tensorflow/models/tree/master/official). -* `tf.data` - * Changes rebatching for `tf.data datasets` + DistributionStrategy for better performance. Note that the dataset also behaves slightly differently, in that the rebatched dataset cardinality will always be a multiple of the number of replicas. - * `tf.data.Dataset` now supports automatic data distribution and sharding in distributed environments, including on TPU pods. - * Distribution policies for `tf.data.Dataset` can now be tuned with 1. `tf.data.experimental.AutoShardPolicy(OFF, AUTO, FILE, DATA)` 2. `tf.data.experimental.ExternalStatePolicy(WARN, IGNORE, FAIL)` -* `tf.debugging` - * Add `tf.debugging.enable_check_numerics()` and `tf.debugging.disable_check_numerics()` to help debugging the root causes of issues involving infinities and `NaN`s. -* `tf.distribute` - * Custom training loop support on TPUs and TPU pods is avaiable through `strategy.experimental_distribute_dataset`, `strategy.experimental_distribute_datasets_from_function`, `strategy.experimental_run_v2`, `strategy.reduce`. - * Support for a global distribution strategy through `tf.distribute.experimental_set_strategy(),` in addition to `strategy.scope()`. -* `TensorRT` - * [TensorRT 6.0](https://developer.nvidia.com/tensorrt#tensorrt-whats-new) is now supported and enabled by default. This adds support for more TensorFlow ops including Conv3D, Conv3DBackpropInputV2, AvgPool3D, MaxPool3D, ResizeBilinear, and ResizeNearestNeighbor. In addition, the TensorFlow-TensorRT python conversion API is exported as `tf.experimental.tensorrt.Converter`. -* Environment variable `TF_DETERMINISTIC_OPS` has been added. When set to "true" or "1", this environment variable makes `tf.nn.bias_add` operate deterministically (i.e. reproducibly), but currently only when XLA JIT compilation is *not* enabled. Setting `TF_DETERMINISTIC_OPS` to "true" or "1" also makes cuDNN convolution and max-pooling operate deterministically. This makes Keras Conv\*D and MaxPool\*D layers operate deterministically in both the forward and backward directions when running on a CUDA-enabled GPU. + +* The `tensorflow` pip package now includes GPU support by default (same as + `tensorflow-gpu`) for both Linux and Windows. This runs on machines with and + without NVIDIA GPUs. `tensorflow-gpu` is still available, and CPU-only + packages can be downloaded at `tensorflow-cpu` for users who are concerned + about package size. 
+* **Windows users:** Officially-released `tensorflow` Pip packages are now + built with Visual Studio 2019 version 16.4 in order to take advantage of the + new `/d2ReducedOptimizeHugeFunctions` compiler flag. To use these new + packages, you must install "Microsoft Visual C++ Redistributable for Visual + Studio 2015, 2017 and 2019", available from Microsoft's website + [here](https://support.microsoft.com/help/2977003/the-latest-supported-visual-c-downloads). + * This does not change the minimum required version for building + TensorFlow from source on Windows, but builds enabling + `EIGEN_STRONG_INLINE` can take over 48 hours to compile without this + flag. Refer to `configure.py` for more information about + `EIGEN_STRONG_INLINE` and `/d2ReducedOptimizeHugeFunctions`. + * If either of the required DLLs, `msvcp140.dll` (old) or `msvcp140_1.dll` + (new), are missing on your machine, `import tensorflow` will print a + warning message. +* The `tensorflow` pip package is built with CUDA 10.1 and cuDNN 7.6. +* `tf.keras` + * Experimental support for mixed precision is available on GPUs and Cloud + TPUs. See + [usage guide](https://www.tensorflow.org/guide/keras/mixed_precision). + * Introduced the `TextVectorization` layer, which takes as input raw + strings and takes care of text standardization, tokenization, n-gram + generation, and vocabulary indexing. See this + [end-to-end text classification example](https://colab.research.google.com/drive/1RvCnR7h0_l4Ekn5vINWToI9TNJdpUZB3). + * Keras `.compile` `.fit` `.evaluate` and `.predict` are allowed to be + outside of the DistributionStrategy scope, as long as the model was + constructed inside of a scope. + * Experimental support for Keras `.compile`, `.fit`, `.evaluate`, and + `.predict` is available for Cloud TPUs, Cloud TPU, for all types of + Keras models (sequential, functional and subclassing models). + * Automatic outside compilation is now enabled for Cloud TPUs. This allows + `tf.summary` to be used more conveniently with Cloud TPUs. + * Dynamic batch sizes with DistributionStrategy and Keras are supported on + Cloud TPUs. + * Support for `.fit`, `.evaluate`, `.predict` on TPU using numpy data, in + addition to `tf.data.Dataset`. + * Keras reference implementations for many popular models are available in + the TensorFlow + [Model Garden](https://github.com/tensorflow/models/tree/master/official). +* `tf.data` + * Changes rebatching for `tf.data datasets` + DistributionStrategy for + better performance. Note that the dataset also behaves slightly + differently, in that the rebatched dataset cardinality will always be a + multiple of the number of replicas. + * `tf.data.Dataset` now supports automatic data distribution and sharding + in distributed environments, including on TPU pods. + * Distribution policies for `tf.data.Dataset` can now be tuned with 1. + `tf.data.experimental.AutoShardPolicy(OFF, AUTO, FILE, DATA)` 2. + `tf.data.experimental.ExternalStatePolicy(WARN, IGNORE, FAIL)` +* `tf.debugging` + * Add `tf.debugging.enable_check_numerics()` and + `tf.debugging.disable_check_numerics()` to help debugging the root + causes of issues involving infinities and `NaN`s. +* `tf.distribute` + * Custom training loop support on TPUs and TPU pods is available through + `strategy.experimental_distribute_dataset`, + `strategy.experimental_distribute_datasets_from_function`, + `strategy.experimental_run_v2`, `strategy.reduce`. 
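A compressed sketch of the custom-loop pattern these APIs enable, using the TF 2.1-era names (`experimental_run_v2` has since been replaced by `Strategy.run`); the model and data are placeholders:

```
import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()  # TPUStrategy follows the same pattern
dataset = tf.data.Dataset.from_tensor_slices(tf.ones([8, 4])).batch(4)
dist_dataset = strategy.experimental_distribute_dataset(dataset)

with strategy.scope():
    dense = tf.keras.layers.Dense(1)

@tf.function
def step(batch):
    per_replica = strategy.experimental_run_v2(
        lambda x: tf.reduce_sum(dense(x)), args=(batch,))
    return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica, axis=None)

for batch in dist_dataset:
    step(batch)
```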
+ * Support for a global distribution strategy through + `tf.distribute.experimental_set_strategy(),` in addition to + `strategy.scope()`. +* `TensorRT` + * [TensorRT 6.0](https://developer.nvidia.com/tensorrt#tensorrt-whats-new) + is now supported and enabled by default. This adds support for more + TensorFlow ops including Conv3D, Conv3DBackpropInputV2, AvgPool3D, + MaxPool3D, ResizeBilinear, and ResizeNearestNeighbor. In addition, the + TensorFlow-TensorRT python conversion API is exported as + `tf.experimental.tensorrt.Converter`. +* Environment variable `TF_DETERMINISTIC_OPS` has been added. When set to + "true" or "1", this environment variable makes `tf.nn.bias_add` operate + deterministically (i.e. reproducibly), but currently only when XLA JIT + compilation is *not* enabled. Setting `TF_DETERMINISTIC_OPS` to "true" or + "1" also makes cuDNN convolution and max-pooling operate deterministically. + This makes Keras Conv\*D and MaxPool\*D layers operate deterministically in + both the forward and backward directions when running on a CUDA-enabled GPU. ## Breaking Changes * Deletes `Operation.traceback_with_start_lines` for which we know of no usages. diff --git a/tensorflow/BUILD b/tensorflow/BUILD index 8a0918b416f..d1c1d7dcdef 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -260,6 +260,36 @@ config_setting( visibility = ["//visibility:public"], ) +config_setting( + name = "armeabi", + values = {"cpu": "armeabi"}, + visibility = ["//visibility:public"], +) + +config_setting( + name = "armeabi-v7a", + values = {"cpu": "armeabi-v7a"}, + visibility = ["//visibility:public"], +) + +config_setting( + name = "arm64-v8a", + values = {"cpu": "arm64-v8a"}, + visibility = ["//visibility:public"], +) + +selects.config_setting_group( + name = "arm_any", + match_any = [ + ":arm", + ":armeabi", + ":armeabi-v7a", + ":arm64-v8a", + ":linux_aarch64", + ":linux_armhf", + ], +) + config_setting( name = "freebsd", values = {"cpu": "freebsd"}, @@ -532,16 +562,14 @@ selects.config_setting_group( package_group( name = "internal", packages = [ - # To pass open source testing in the pip Kokoros. - "//bazel_pip/tensorflow/...", "//learning/brain/swift/x10/...", "//perftools/accelerators/xprof/api/...", - "//third_party/py/autograph/...", - "//third_party/swift/tensorflow/x10/...", - "//third_party/swift/tensorflow_apis/...", "//tensorflow/...", "//tensorflow_estimator/python/estimator/...", "//tensorflow_models/official/...", + "//third_party/py/autograph/...", + "//third_party/swift/tensorflow/x10/...", + "//third_party/swift/tensorflow_apis/...", ], ) diff --git a/tensorflow/api_template.__init__.py b/tensorflow/api_template.__init__.py index f0f977aa0b5..5932dda514d 100644 --- a/tensorflow/api_template.__init__.py +++ b/tensorflow/api_template.__init__.py @@ -137,7 +137,7 @@ if _running_from_pip_package(): # TODO(gunan): Add sanity checks to loaded modules here. for _s in _site_packages_dirs: # Load first party dynamic kernels. - _main_dir = _os.path.join(_s, 'tensorflow_core/core/kernels') + _main_dir = _os.path.join(_s, 'tensorflow/core/kernels') if _fi.file_exists(_main_dir): _ll.load_library(_main_dir) @@ -158,4 +158,23 @@ if hasattr(_current_module, 'keras'): setattr(_current_module, "initializers", initializers) # pylint: enable=undefined-variable +# Delete modules that should be hidden from dir(). +# Don't fail if these modules are not available. +# For e.g. this file will be originally placed under tensorflow/_api/v1 which +# does not have 'python', 'core' directories. 
Then, it will be copied +# to tensorflow/ which does have these two directories. +# pylint: disable=undefined-variable +try: + del python +except NameError: + pass +try: + del core +except NameError: + pass +try: + del compiler +except NameError: + pass + # __all__ PLACEHOLDER diff --git a/tensorflow/api_template_v1.__init__.py b/tensorflow/api_template_v1.__init__.py index dad91f2d5b2..0d1d2e56fae 100644 --- a/tensorflow/api_template_v1.__init__.py +++ b/tensorflow/api_template_v1.__init__.py @@ -147,7 +147,7 @@ if _running_from_pip_package(): # TODO(gunan): Add sanity checks to loaded modules here. for _s in _site_packages_dirs: # Load first party dynamic kernels. - _main_dir = _os.path.join(_s, 'tensorflow_core/core/kernels') + _main_dir = _os.path.join(_s, 'tensorflow/core/kernels') if _fi.file_exists(_main_dir): _ll.load_library(_main_dir) @@ -156,4 +156,25 @@ if _running_from_pip_package(): if _fi.file_exists(_plugin_dir): _ll.load_library(_plugin_dir) +# Delete modules that should be hidden from dir(). +# Don't fail if these modules are not available. +# For e.g. this file will be originally placed under tensorflow/_api/v1 which +# does not have 'python', 'core' directories. Then, it will be copied +# to tensorflow/ which does have these two directories. + +# pylint: disable=undefined-variable +try: + del python +except NameError: + pass +try: + del core +except NameError: + pass +try: + del compiler +except NameError: + pass + + # __all__ PLACEHOLDER diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD index 410fc22069f..e5efe323922 100644 --- a/tensorflow/c/BUILD +++ b/tensorflow/c/BUILD @@ -213,6 +213,17 @@ tf_cuda_library( alwayslink = 1, ) +cc_library( + name = "logging", + srcs = ["logging.cc"], + hdrs = ["logging.h"], + deps = [ + ":c_api_macros", + "//tensorflow/core/platform:logging", + "//tensorflow/core/platform:stringprintf", + ], +) + tf_cuda_library( name = "tf_status_internal", hdrs = [ diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc index 36a08c8cfc9..2e1759ecea0 100644 --- a/tensorflow/c/c_api.cc +++ b/tensorflow/c/c_api.cc @@ -213,7 +213,6 @@ void TF_Reset(const TF_SessionOptions* opt, const char** containers, namespace tensorflow { - Status MessageToBuffer(const tensorflow::protobuf::MessageLite& in, TF_Buffer* out) { if (out->data != nullptr) { @@ -306,8 +305,8 @@ void TF_GraphSetOutputHandleShapesAndTypes(TF_Graph* graph, TF_Output output, } // Helpers for loading a TensorFlow plugin (a .so file). 
-Status LoadLibrary(const char* library_filename, void** result, - const void** buf, size_t* len); +Status LoadDynamicLibrary(const char* library_filename, void** result, + const void** buf, size_t* len); // TODO(josh11b,mrry): Change Session to be able to use a Graph* // directly, instead of requiring us to serialize to a GraphDef and @@ -552,7 +551,7 @@ void TF_PRun(TF_DeprecatedSession* s, const char* handle, TF_Library* TF_LoadLibrary(const char* library_filename, TF_Status* status) { TF_Library* lib_handle = new TF_Library; - status->status = tensorflow::LoadLibrary( + status->status = tensorflow::LoadDynamicLibrary( library_filename, &lib_handle->lib_handle, &lib_handle->op_list.data, &lib_handle->op_list.length); if (!status->status.ok()) { diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h index 808bcf3bd80..0b4d9993e4d 100644 --- a/tensorflow/c/c_api.h +++ b/tensorflow/c/c_api.h @@ -125,6 +125,14 @@ TF_CAPI_EXPORT extern void TF_DeleteBuffer(TF_Buffer*); TF_CAPI_EXPORT extern TF_Buffer TF_GetBuffer(TF_Buffer* buffer); +// -------------------------------------------------------------------------- +// Used to return strings across the C API. The caller does not take ownership +// of the underlying data pointer and is not responsible for freeing it. +typedef struct TF_StringView { + const char* data; + size_t len; +} TF_StringView; + // -------------------------------------------------------------------------- // TF_SessionOptions holds options that can be passed during session creation. typedef struct TF_SessionOptions TF_SessionOptions; diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc index 831c6a0ad40..b4297033b6d 100644 --- a/tensorflow/c/c_api_experimental.cc +++ b/tensorflow/c/c_api_experimental.cc @@ -29,6 +29,7 @@ limitations under the License. 
#include "tensorflow/core/common_runtime/eager/context.h" #include "tensorflow/core/common_runtime/eager/eager_operation.h" #include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h" +#include "tensorflow/core/framework/collective.h" #include "tensorflow/core/framework/node_def.pb.h" #include "tensorflow/core/framework/shape_inference.h" #include "tensorflow/core/framework/tensor.pb.h" @@ -525,12 +526,12 @@ tensorflow::Status EnableCollectiveOps(const tensorflow::ServerDef& server_def, LOG_AND_RETURN_IF_ERROR(context->StoreCollectiveOpsServer( std::move(new_server), grpc_server->worker_env()->device_mgr, - grpc_server->worker_env()->collective_executor_mgr)); + grpc_server->worker_env()->collective_executor_mgr.get())); } else { LOG_AND_RETURN_IF_ERROR(grpc_server->UpdateServerDef(server_def)); LOG_AND_RETURN_IF_ERROR(context->StoreCollectiveOpsServer( /*new_server=*/nullptr, grpc_server->worker_env()->device_mgr, - grpc_server->worker_env()->collective_executor_mgr)); + grpc_server->worker_env()->collective_executor_mgr.get())); } return tensorflow::Status::OK(); #undef LOG_AND_RETURN_IF_ERROR @@ -551,6 +552,14 @@ TF_CAPI_EXPORT extern void TFE_EnableCollectiveOps(TFE_Context* ctx, status->status = EnableCollectiveOps(server_def, ctx); } +TF_CAPI_EXPORT extern void TFE_AbortCollectiveOps(TFE_Context* ctx, + TF_Status* status) { + tensorflow::EagerContext* context = + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); + auto collective_executor_handle = context->GetCollectiveExecutorHandle(); + collective_executor_handle->get()->StartAbort(status->status); +} + TF_ShapeAndTypeList* TF_NewShapeAndTypeList(int num_items) { TF_ShapeAndTypeList* result = new TF_ShapeAndTypeList; result->num_items = num_items; diff --git a/tensorflow/c/c_api_experimental.h b/tensorflow/c/c_api_experimental.h index d0ffbf125fb..ebd14b4b571 100644 --- a/tensorflow/c/c_api_experimental.h +++ b/tensorflow/c/c_api_experimental.h @@ -230,6 +230,14 @@ TF_CAPI_EXPORT extern void TFE_EnableCollectiveOps(TFE_Context* ctx, size_t proto_len, TF_Status* status); +// Aborts all ongoing collectives with the specified status. After abortion, +// subsequent collectives will error with this status immediately. +// +// This is intended to be used when a peer failure is detected. There's yet no +// way to reset the collectives other than restarting the program. +TF_CAPI_EXPORT extern void TFE_AbortCollectiveOps(TFE_Context* ctx, + TF_Status* status); + // Information about the shape of a Tensor and its type. struct TF_ShapeAndType { // Number of dimensions. -1 indicates unknown rank. 
diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index a77e76644b8..61701bc8b21 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -240,6 +240,8 @@ tf_cuda_cc_test( "//tensorflow/c:c_api", "//tensorflow/c:c_test_util", "//tensorflow/c:tf_status_helper", + "//tensorflow/c/experimental/gradients:math_grad", + "//tensorflow/c/experimental/ops:array_ops", "//tensorflow/cc/profiler", "//tensorflow/compiler/mlir/tensorflow/c:mlir_c_api_registration", "//tensorflow/core:lib", @@ -260,6 +262,7 @@ cc_library( ], deps = [ "//tensorflow/core:protos_all_cc", + "//tensorflow/core/platform:refcount", ], ) @@ -308,6 +311,8 @@ cc_library( "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", + "//tensorflow/core/util:abstract_stack_trace", + "@com_google_absl//absl/types:optional", "@com_google_absl//absl/types:span", ], ) @@ -514,7 +519,6 @@ tf_cuda_cc_test( extra_copts = tfe_xla_copts(), tags = [ "no_windows", - "noasan", # leaks gRPC server instances ], deps = [ ":c_api", @@ -581,7 +585,6 @@ tf_cuda_cc_test( extra_copts = tfe_xla_copts(), tags = [ "no_windows", - "noasan", # leaks gRPC server instances ], deps = [ ":c_api", diff --git a/tensorflow/c/eager/abstract_tensor_handle.h b/tensorflow/c/eager/abstract_tensor_handle.h index de041690420..37e6d1bf29c 100644 --- a/tensorflow/c/eager/abstract_tensor_handle.h +++ b/tensorflow/c/eager/abstract_tensor_handle.h @@ -18,11 +18,12 @@ limitations under the License. #include #include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/refcount.h" namespace tensorflow { // Abstract interface to a Tensor handle in either tracing or immediate // execution mode. -class AbstractTensorHandle { +class AbstractTensorHandle : public core::RefCounted { protected: enum AbstractTensorHandleKind { kGraph, kMlir, kEager, kTfrt }; explicit AbstractTensorHandle(AbstractTensorHandleKind kind) : kind_(kind) {} @@ -34,14 +35,6 @@ class AbstractTensorHandle { AbstractTensorHandleKind getKind() const { return kind_; } - // Release any underlying resources, including the interface object. - // - // WARNING: The destructor of this class is marked as protected to disallow - // clients from directly destroying this object since it may manage it's own - // lifetime through ref counting. Thus this must be allocated on the heap and - // clients MUST call Release() in order to destroy an instance of this class. - virtual void Release() = 0; - private: const AbstractTensorHandleKind kind_; }; @@ -50,7 +43,7 @@ namespace internal { struct AbstractTensorHandleDeleter { void operator()(AbstractTensorHandle* p) const { if (p != nullptr) { - p->Release(); + p->Unref(); } } }; diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index 70acd710166..fefa753c608 100644 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -94,7 +94,6 @@ limitations under the License. 
#include "tensorflow/core/profiler/lib/traceme.h" #include "tensorflow/core/public/version.h" -using tensorflow::int64; using tensorflow::string; namespace { @@ -725,13 +724,7 @@ void TFE_DeleteContextOptions(TFE_ContextOptions* options) { delete options; } TFE_Context* TFE_NewContext(const TFE_ContextOptions* opts, TF_Status* status) { if (opts->use_tfrt) { #ifdef PLATFORM_GOOGLE - tfrt::SmallVector op_handler_chains; - tfrt::SmallVector device_attributes; - status->status = tfrt::ListOpHandlerChains( - opts->session_options.options, &op_handler_chains, &device_attributes); - if (!status->status.ok()) return nullptr; - return tensorflow::wrap(new tfrt::ContextInterface( - op_handler_chains, device_attributes, opts->async)); + return tensorflow::wrap(new tfrt::tf::ContextInterface(opts->async)); #else status->status = tensorflow::errors::Unimplemented("TFRT is not supported"); return nullptr; @@ -974,7 +967,7 @@ int64_t TFE_TensorHandleNumElements(TFE_TensorHandle* h, TF_Status* status) { return -1; } - int64 num_elements = -1; + tensorflow::int64 num_elements = -1; status->status = tensorflow::unwrap(h)->NumElements(&num_elements); return num_elements; } @@ -986,7 +979,7 @@ int64_t TFE_TensorHandleDim(TFE_TensorHandle* h, int dim_index, return -1; } - int64 dim = -1; + tensorflow::int64 dim = -1; status->status = tensorflow::unwrap(h)->Dim(dim_index, &dim); return dim; } @@ -1079,11 +1072,13 @@ TFE_TensorHandle* TFE_NewTensorHandleFromDeviceMemory( status->status = context->FindDeviceFromName(device_name, &device); tensorflow::CustomDevice* custom_device = nullptr; if (!status->status.ok()) { - status->status = - context->FindCustomDeviceFromName(device_name, &custom_device); - if (!status->status.ok()) { + if (!context->FindCustomDeviceFromName(device_name, &custom_device)) { deallocator(data, len, deallocator_arg); + status->status = + tensorflow::errors::InvalidArgument(device_name, " unknown device."); return nullptr; + } else { + status->status = tensorflow::Status::OK(); } } std::vector dimvec(num_dims); diff --git a/tensorflow/c/eager/c_api_debug.cc b/tensorflow/c/eager/c_api_debug.cc index 6827021455b..dd55f05283b 100644 --- a/tensorflow/c/eager/c_api_debug.cc +++ b/tensorflow/c/eager/c_api_debug.cc @@ -26,14 +26,13 @@ limitations under the License. #include "tensorflow/compiler/jit/xla_device.h" #endif // TENSORFLOW_EAGER_USE_XLA -using tensorflow::int64; using tensorflow::string; namespace { -std::vector TensorShapeAsVector(const tensorflow::TensorHandle& handle, - tensorflow::Status* status) { - std::vector shape; +std::vector TensorShapeAsVector( + const tensorflow::TensorHandle& handle, tensorflow::Status* status) { + std::vector shape; int rank = -1; *status = handle.NumDims(&rank); if (!status->ok()) { @@ -79,7 +78,7 @@ TF_CAPI_EXPORT extern TFE_TensorDebugInfo* TFE_TensorHandleTensorDebugInfo( return nullptr; } if (VLOG_IS_ON(3)) { - std::vector shape_to_log = + std::vector shape_to_log = TensorShapeAsVector(*handle, &status->status); if (!status->status.ok()) { // Ignore the status here as we are simply logging. 
@@ -128,14 +127,14 @@ TF_CAPI_EXPORT extern TFE_TensorDebugInfo* TFE_TensorHandleTensorDebugInfo( } int rank = padded_shape.dimensions_size(); - std::vector dev_dims; + std::vector dev_dims; dev_dims.reserve(rank); if (rank == 1) { // Rank 1 tensors might not have padded_shape.layout.minor_to_major set, dev_dims.push_back(padded_shape.dimensions(0)); } else { for (int i = rank - 1; i >= 0; --i) { - int64 dim_index = padded_shape.layout().minor_to_major(i); + tensorflow::int64 dim_index = padded_shape.layout().minor_to_major(i); dev_dims.push_back(padded_shape.dimensions(dim_index)); } } @@ -146,7 +145,8 @@ TF_CAPI_EXPORT extern TFE_TensorDebugInfo* TFE_TensorHandleTensorDebugInfo( // If the tensor is not an XLA tensor, the device shape is // the same as regular tensor shape. - std::vector dev_dims = TensorShapeAsVector(*handle, &status->status); + std::vector dev_dims = + TensorShapeAsVector(*handle, &status->status); if (!status->status.ok()) { return nullptr; } diff --git a/tensorflow/c/eager/c_api_distributed_test.cc b/tensorflow/c/eager/c_api_distributed_test.cc index a6547e23454..3738768cf02 100644 --- a/tensorflow/c/eager/c_api_distributed_test.cc +++ b/tensorflow/c/eager/c_api_distributed_test.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include // NOLINT + #include "tensorflow/c/eager/c_api.h" #include "tensorflow/c/eager/c_api_experimental.h" #include "tensorflow/c/eager/c_api_internal.h" @@ -174,9 +176,9 @@ void TestFunctionWithPackedInput(const bool remote) { const char task2_name[] = "/job:localhost/replica:0/task:2/device:CPU:0"; // Create one variable per task. - TFE_TensorHandle* h0 = TestVariable(ctx, 1.0, task0_name); - TFE_TensorHandle* h1 = TestVariable(ctx, 2.0, task1_name); - TFE_TensorHandle* h2 = TestVariable(ctx, 3.0, task2_name); + TFE_TensorHandle* h0 = TestVariable(ctx, 1.0, task1_name); + TFE_TensorHandle* h1 = TestVariable(ctx, 2.0, task2_name); + TFE_TensorHandle* h2 = TestVariable(ctx, 3.0, task0_name); // Add a sync point in order to make sure that variables have been initialized // before the function execution starts. @@ -185,6 +187,9 @@ void TestFunctionWithPackedInput(const bool remote) { VarIsInitialized(ctx, h2); // Pack 3 variable handles into one TFE_TensorHandle. + // When remote is false, function device is placed on task0. Handle types are + // REMOTE, REMOTE, LOCAL on task0. When remote is true, function device is + // placed on task1, Handle types are LOCAL, REMOTE, LOCAL on task1. 
int num_replicas = 3; std::vector handles = {h0, h1, h2}; TFE_TensorHandle* packed_handle = @@ -259,61 +264,64 @@ TEST(CAPI, TestRemoteFunctionWithPackedInput) { TestFunctionWithPackedInput(/*remote=*/true); } +string VariableAddFunctionSignature() { + return " signature {" + " name: 'VariableAddFunction'" + " input_arg {" + " name: 'var0'" + " type: DT_RESOURCE" + " }" + " output_arg {" + " name: 'var0_value'" + " type: DT_FLOAT" + " }" + " }" + " node_def {" + " name: 'read0'" + " op: 'ReadVariableOp'" + " input: 'var0'" + " attr {" + " key: 'dtype'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " node_def {" + " name: 'add'" + " op: 'Add'" + " input: 'read0:value:0'" + " input: 'read0:value:0'" + " device: '/job:localhost/task:1/device:CPU:0'" + " attr {" + " key: 'T'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " node_def {" + " name: 'identity'" + " op: 'Identity'" + " input: 'add:z:0'" + " device: '/job:localhost/task:0/device:CPU:0'" + " attr {" + " key: 'T'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " ret {" + " key: 'var0_value'" + " value: 'identity:output:0'" + " }"; +} + string VariableAddFunction() { tensorflow::FunctionDef def; CHECK(tensorflow::protobuf::TextFormat::ParseFromString( - " signature {" - " name: 'VariableAddFunction'" - " input_arg {" - " name: 'var0'" - " type: DT_RESOURCE" - " }" - " output_arg {" - " name: 'var0_value'" - " type: DT_FLOAT" - " }" - " }" - " node_def {" - " name: 'read0'" - " op: 'ReadVariableOp'" - " input: 'var0'" - " attr {" - " key: 'dtype'" - " value {" - " type: DT_FLOAT" - " }" - " }" - " }" - " node_def {" - " name: 'add'" - " op: 'Add'" - " input: 'read0:value:0'" - " input: 'read0:value:0'" - " device: '/job:localhost/task:1/device:CPU:0'" - " attr {" - " key: 'T'" - " value {" - " type: DT_FLOAT" - " }" - " }" - " }" - " node_def {" - " name: 'identity'" - " op: 'Identity'" - " input: 'add:z:0'" - " device: '/job:localhost/task:0/device:CPU:0'" - " attr {" - " key: 'T'" - " value {" - " type: DT_FLOAT" - " }" - " }" - " }" - " ret {" - " key: 'var0_value'" - " value: 'identity:output:0'" - " }", - &def)); + VariableAddFunctionSignature(), &def)); return def.SerializeAsString(); } @@ -425,6 +433,17 @@ TEST(CAPI, DistributedFunctionGraphPassOnlyOnce) { GraphErrorInjectionPass::enabled_ = false; } +string VariableAddFunctionWithGraphError() { + string signature = VariableAddFunctionSignature(); + // Replace the node 'read0' with 'read0_maybe_with_graph_error', so that the + // error injecting pass can identify and introduce graph pass errors. + signature = std::regex_replace(signature, std::regex("read0"), + "read0_maybe_with_graph_error"); + tensorflow::FunctionDef def; + CHECK(tensorflow::protobuf::TextFormat::ParseFromString(signature, &def)); + return def.SerializeAsString(); +} + class FunctionErrorInjectionPass : public tensorflow::FunctionOptimizationPass { public: FunctionErrorInjectionPass(string error_node, string error_device) @@ -471,16 +490,19 @@ void TestDistributedFunctionCancellation(bool inject_error) { const char dev2_name[] = "/job:localhost/replica:0/task:2/device:CPU:0"; if (inject_error) { - // Inject a function optimization pass failure when it sees the 'read0' op - // having a requested device `dev2_name`. 
During execution: - // * task:0 processes the main function `VariableAddFunction` and places - // the read0 op on task:2 - // * task:0 partitions the main function with a subgraph containing read0 - // sent to task:2 - // * task:2 graph pass reports an error when it sees read0 with dev2_name + // Inject a function optimization pass failure when it sees the + // 'read0_maybe_with_graph_error' op having a requested device `dev2_name`. + // During execution: + // * task:0 processes main function `VariableAddFunctionWithGraphError` + // and places the 'read0_maybe_with_graph_error' op on task:2 + // * task:0 partitions the main function with a subgraph containing + // 'read0_maybe_with_graph_error' sent to task:2 + // * task:2 graph pass reports an error when it sees + // 'read0_maybe_with_graph_error' with dev2_name tensorflow::function_optimization_registration:: FunctionOptimizationPassRegistration register_test_pass( - std::make_unique("read0", dev2_name)); + std::make_unique( + "read0_maybe_with_graph_error", dev2_name)); } TF_Status* status = TF_NewStatus(); @@ -496,7 +518,7 @@ void TestDistributedFunctionCancellation(bool inject_error) { TFE_TensorHandle* var_handle = TestVariable(ctx, 2.0, dev2_name); EXPECT_NE(var_handle, nullptr); - const string function_def = VariableAddFunction(); + const string function_def = VariableAddFunctionWithGraphError(); TFE_ContextAddFunctionDef(ctx, function_def.data(), function_def.size(), status); ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); diff --git a/tensorflow/c/eager/c_api_remote_test.cc b/tensorflow/c/eager/c_api_remote_test.cc index 94c32cf3f30..e99f6d6e170 100644 --- a/tensorflow/c/eager/c_api_remote_test.cc +++ b/tensorflow/c/eager/c_api_remote_test.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include "absl/strings/str_cat.h" #include "tensorflow/c/eager/c_api.h" #include "tensorflow/c/eager/c_api_experimental.h" #include "tensorflow/c/eager/c_api_internal.h" @@ -115,40 +116,42 @@ void TestRemoteExecute(bool async) { TEST(CAPI, RemoteExecute) { TestRemoteExecute(false); } TEST(CAPI, RemoteExecuteAsync) { TestRemoteExecute(true); } -string MatMulFunction() { +string MatMulFunction(const string& matmul_device) { tensorflow::FunctionDef def; CHECK(tensorflow::protobuf::TextFormat::ParseFromString( - " signature {" - " name: 'MatMulFunction'" - " input_arg {" - " name: 'a'" - " type: DT_FLOAT" - " }" - " input_arg {" - " name: 'b'" - " type: DT_FLOAT" - " }" - " output_arg {" - " name: 'm'" - " type: DT_FLOAT" - " }" - " }" - " node_def {" - " name: 'matmul'" - " op: 'MatMul'" - " input: 'a'" - " input: 'b'" - " attr {" - " key: 'T'" - " value {" - " type: DT_FLOAT" - " }" - " }" - " }" - " ret {" - " key: 'm'" - " value: 'matmul:product'" - " }", + absl::StrCat(" signature {" + " name: 'MatMulFunction'" + " input_arg {" + " name: 'a'" + " type: DT_FLOAT" + " }" + " input_arg {" + " name: 'b'" + " type: DT_FLOAT" + " }" + " output_arg {" + " name: 'm'" + " type: DT_FLOAT" + " }" + " }" + " node_def {" + " name: 'matmul'" + " op: 'MatMul'" + " input: 'a'" + " input: 'b'" + " device: '", + matmul_device, "'", + " attr {" + " key: 'T'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " ret {" + " key: 'm'" + " value: 'matmul:product'" + " }"), &def)); return def.SerializeAsString(); } @@ -157,7 +160,8 @@ string MatMulFunction() { // which creates a remote remote input, to simulate a scenario that the remote // input is not ready when we start running an op or a function. void TestRemoteExecuteSilentCopies(bool async, bool remote, bool func, - bool heavy_load_on_streaming_rpc) { + bool heavy_load_on_streaming_rpc, + bool remote_func_outputs = false) { tensorflow::ServerDef server_def = GetServerDef(3); // This server def has the task index set to 0. @@ -214,7 +218,8 @@ void TestRemoteExecuteSilentCopies(bool async, bool remote, bool func, TFE_Op* matmul = nullptr; if (func) { - string function_def = MatMulFunction(); + const string matmul_device = remote_func_outputs ? task2_name : ""; + string function_def = MatMulFunction(matmul_device); TFE_ContextAddFunctionDef(ctx, function_def.data(), function_def.size(), status); CHECK_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); @@ -250,7 +255,7 @@ void TestRemoteExecuteSilentCopies(bool async, bool remote, bool func, EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); // TODO(gjn): Add support for waiting on async local mirrors - if (!remote && !async) { + if (!remote && !async && !remote_func_outputs) { auto remote_arg = tensorflow::TensorHandleFromInterface(tensorflow::unwrap(h1_task2)); // The input handles should never change since they have been mirrored. @@ -329,6 +334,19 @@ TEST(CAPI, RemoteExecuteSilentCopiesLocalAsyncFunc) { TestRemoteExecuteSilentCopies(/*async=*/true, /*remote=*/false, /*func=*/true, /*heavy_load_on_streaming_rpc=*/false); } +// TODO(b/162618595): Enable this test once we remove the check of remote +// outputs in ProcessFunctionLibraryRuntime. 
+TEST(CAPI, DISABLED_RemoteExecuteSilentCopiesLocalFuncRemoteOutputs) { + TestRemoteExecuteSilentCopies(/*async=*/false, /*remote=*/false, + /*func=*/true, + /*heavy_load_on_streaming_rpc=*/false, + /*remote_func_outputs=*/true); +} +TEST(CAPI, DISABLED_RemoteExecuteSilentCopiesLocalAsyncFuncRemoteOutputs) { + TestRemoteExecuteSilentCopies(/*async=*/true, /*remote=*/false, /*func=*/true, + /*heavy_load_on_streaming_rpc=*/false, + /*remote_func_outputs=*/true); +} TEST(CAPI, RemoteExecuteSilentCopiesLocalAsyncFuncOrdering) { // A remote input may be not ready when we start running a function. Test that // the function execution should wait until the remote input is ready. diff --git a/tensorflow/c/eager/c_api_test_util.cc b/tensorflow/c/eager/c_api_test_util.cc index 4b5ad8f50f7..192f10533a6 100644 --- a/tensorflow/c/eager/c_api_test_util.cc +++ b/tensorflow/c/eager/c_api_test_util.cc @@ -88,6 +88,20 @@ TFE_TensorHandle* TestMatrixTensorHandle(TFE_Context* ctx) { return th; } +TFE_TensorHandle* TestMatrixTensorHandleWithInput(TFE_Context* ctx, + float data[], int64_t dims[], + int num_dims) { + TF_Status* status = TF_NewStatus(); + TF_Tensor* t = + TFE_AllocateHostTensor(ctx, TF_FLOAT, &dims[0], num_dims, status); + memcpy(TF_TensorData(t), &data[0], TF_TensorByteSize(t)); + TFE_TensorHandle* th = TFE_NewTensorHandleFromTensor(ctx, t, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TF_DeleteTensor(t); + TF_DeleteStatus(status); + return th; +} + TFE_TensorHandle* TestMatrixTensorHandle100x100(TFE_Context* ctx) { constexpr int64_t dims[] = {100, 100}; constexpr int num_elements = dims[0] * dims[1]; @@ -143,7 +157,7 @@ TFE_TensorHandle* TestVariable(TFE_Context* ctx, float value, if (TF_GetCode(status) != TF_OK) return nullptr; TFE_OpSetAttrType(op, "dtype", TF_FLOAT); TFE_OpSetAttrShape(op, "shape", {}, 0, status); - TFE_OpSetAttrString(op, "container", "", 0); + TFE_OpSetAttrString(op, "container", "localhost", 0); TFE_OpSetAttrString(op, "shared_name", "", 0); if (!device_name.empty()) { TFE_OpSetDevice(op, device_name.c_str(), status); diff --git a/tensorflow/c/eager/c_api_test_util.h b/tensorflow/c/eager/c_api_test_util.h index fcf62223f14..fcf407aa9c3 100644 --- a/tensorflow/c/eager/c_api_test_util.h +++ b/tensorflow/c/eager/c_api_test_util.h @@ -34,6 +34,12 @@ TFE_TensorHandle* DoubleTestMatrixTensorHandle(TFE_Context* ctx); // Return a tensor handle containing a 2x2 matrix of floats TFE_TensorHandle* TestMatrixTensorHandle(TFE_Context* ctx); +// Return a tensor handle containing 2D matrix containing given data and +// dimensions +TFE_TensorHandle* TestMatrixTensorHandleWithInput(TFE_Context* ctx, + float data[], int64_t dims[], + int num_dims); + // Return a tensor handle containing a 100x100 matrix of floats TFE_TensorHandle* TestMatrixTensorHandle100x100(TFE_Context* ctx); diff --git a/tensorflow/c/eager/c_api_unified_experimental.cc b/tensorflow/c/eager/c_api_unified_experimental.cc index 605a60c186c..8408f7ef60f 100644 --- a/tensorflow/c/eager/c_api_unified_experimental.cc +++ b/tensorflow/c/eager/c_api_unified_experimental.cc @@ -147,7 +147,7 @@ TF_AbstractOp* TF_NewAbstractOp(TF_ExecutionContext* c) { void TF_DeleteAbstractOp(TF_AbstractOp* op) { unwrap(op)->Release(); } -void TF_DeleteAbstractTensor(TF_AbstractTensor* t) { unwrap(t)->Release(); } +void TF_DeleteAbstractTensor(TF_AbstractTensor* t) { unwrap(t)->Unref(); } TF_OutputList* TF_NewOutputList() { return wrap(new OutputList); } void TF_DeleteOutputList(TF_OutputList* o) { delete unwrap(o); } 
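The new `TestMatrixTensorHandleWithInput` helper declared above is used throughout the unified-API tests later in this patch. The sketch below shows the intended call pattern; it is illustrative only and assumes an existing `TFE_Context* ctx` owned by the surrounding test.

```cpp
// Illustrative sketch: build a 2x2 float tensor handle from caller-supplied
// data using the new test helper, then release it.
float vals[] = {1.0f, 2.0f, 3.0f, 4.0f};
int64_t dims[] = {2, 2};
int num_dims = sizeof(dims) / sizeof(dims[0]);
TFE_TensorHandle* h =
    TestMatrixTensorHandleWithInput(ctx, vals, dims, num_dims);
// ... feed `h` to an op, or wrap it via TF_CreateAbstractTensorFromEagerTensor ...
TFE_DeleteTensorHandle(h);
```
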
diff --git a/tensorflow/c/eager/c_api_unified_experimental_graph.cc b/tensorflow/c/eager/c_api_unified_experimental_graph.cc index 6165a7d14a3..7bda3aed76d 100644 --- a/tensorflow/c/eager/c_api_unified_experimental_graph.cc +++ b/tensorflow/c/eager/c_api_unified_experimental_graph.cc @@ -33,6 +33,7 @@ limitations under the License. using tensorflow::dyn_cast; using tensorflow::string; +using tensorflow::gtl::ArraySlice; namespace tensorflow { namespace tracing { @@ -48,7 +49,6 @@ class GraphTensor : public TracingTensorHandle { public: explicit GraphTensor(TF_Output output) : TracingTensorHandle(kGraph), output_(output) {} - void Release() override { delete this; } tensorflow::DataType DataType() const override { return static_cast(TF_OperationOutputType(output_)); @@ -138,20 +138,23 @@ class GraphOperation : public TracingOperation { Status SetAttrString(const char* attr_name, const char* data, size_t length) override { - return tensorflow::errors::Unimplemented( - "SetAttrString has not been implemented yet."); + tensorflow::StringPiece s(data, length); + op_->node_builder.Attr(attr_name, s); + return Status::OK(); } Status SetAttrInt(const char* attr_name, int64_t value) override { - return tensorflow::errors::Unimplemented( - "SetAttrInt has not been implemented yet."); + static_assert(sizeof(int64_t) == sizeof(tensorflow::int64), + "64-bit int types should match in size"); + op_->node_builder.Attr(attr_name, static_cast(value)); + return Status::OK(); } Status SetAttrFloat(const char* attr_name, float value) override { - return tensorflow::errors::Unimplemented( - "SetAttrFloat has not been implemented yet."); + op_->node_builder.Attr(attr_name, value); + return Status::OK(); } Status SetAttrBool(const char* attr_name, bool value) override { - return tensorflow::errors::Unimplemented( - "SetAttrBool has not been implemented yet."); + op_->node_builder.Attr(attr_name, value); + return Status::OK(); } Status SetAttrType(const char* const attr_name, DataType value) override { if (!op_) { @@ -164,8 +167,15 @@ class GraphOperation : public TracingOperation { } Status SetAttrShape(const char* attr_name, const int64_t* dims, const int num_dims) override { - return tensorflow::errors::Unimplemented( - "SetAttrShape has not been implemented yet."); + PartialTensorShape shape; + if (num_dims >= 0) { + static_assert(sizeof(int64_t) == sizeof(tensorflow::int64), + "64-bit int types should match in size"); + shape = PartialTensorShape(ArraySlice( + reinterpret_cast(dims), num_dims)); + } + op_->node_builder.Attr(attr_name, shape); + return Status::OK(); } Status SetAttrFunction(const char* attr_name, const AbstractOperation* value) override { @@ -174,8 +184,10 @@ class GraphOperation : public TracingOperation { } Status SetAttrFunctionName(const char* attr_name, const char* value, size_t length) override { - return tensorflow::errors::Unimplemented( - "SetAttrFunctionName has not been implemented yet."); + tensorflow::NameAttrList func_name; + func_name.set_name(string(value, value + length)); + op_->node_builder.Attr(attr_name, func_name); + return Status::OK(); } Status SetAttrTensor(const char* attr_name, AbstractTensorInterface* tensor) override { @@ -184,33 +196,71 @@ class GraphOperation : public TracingOperation { } Status SetAttrStringList(const char* attr_name, const void* const* values, const size_t* lengths, int num_values) override { - return tensorflow::errors::Unimplemented( - "SetAttrStringList has not been implemented yet."); + if (strcmp(attr_name, tensorflow::kColocationAttrName) 
== 0) { + op_->colocation_constraints.clear(); + for (int i = 0; i < num_values; ++i) { + op_->colocation_constraints.emplace(static_cast(values[i]), + lengths[i]); + } + } else { + std::vector v; + v.reserve(num_values); + for (int i = 0; i < num_values; ++i) { + v.emplace_back(static_cast(values[i]), lengths[i]); + } + op_->node_builder.Attr(attr_name, v); + } + return Status::OK(); } Status SetAttrFloatList(const char* attr_name, const float* values, int num_values) override { - return tensorflow::errors::Unimplemented( - "SetAttrFloatList has not been implemented yet."); + op_->node_builder.Attr(attr_name, + ArraySlice(values, num_values)); + return Status::OK(); } Status SetAttrIntList(const char* attr_name, const int64_t* values, int num_values) override { - return tensorflow::errors::Unimplemented( - "SetAttrIntList has not been implemented yet."); + static_assert(sizeof(int64_t) == sizeof(tensorflow::int64), + "64-bit int types should match in size"); + op_->node_builder.Attr( + attr_name, + ArraySlice( + reinterpret_cast(values), num_values)); + return Status::OK(); } Status SetAttrTypeList(const char* attr_name, const DataType* values, int num_values) override { - return tensorflow::errors::Unimplemented( - "SetAttrTypeList has not been implemented yet."); + op_->node_builder.Attr(attr_name, + ArraySlice(values, num_values)); + return Status::OK(); } Status SetAttrBoolList(const char* attr_name, const unsigned char* values, int num_values) override { - return tensorflow::errors::Unimplemented( - "SetAttrBoolList has not been implemented yet."); + std::unique_ptr b(new bool[num_values]); + for (int i = 0; i < num_values; ++i) { + b[i] = values[i]; + } + op_->node_builder.Attr(attr_name, + ArraySlice(b.get(), num_values)); + + return Status::OK(); } Status SetAttrShapeList(const char* attr_name, const int64_t** dims, const int* num_dims, int num_values) override { - return tensorflow::errors::Unimplemented( - "SetAttrShapeList has not been implemented yet."); + std::vector shapes; + shapes.reserve(num_values); + for (int i = 0; i < num_values; ++i) { + if (num_dims[i] < 0) { + shapes.emplace_back(); + } else { + static_assert(sizeof(int64_t) == sizeof(tensorflow::int64), + "64-bit int types should match in size"); + shapes.emplace_back(ArraySlice( + reinterpret_cast(dims[i]), num_dims[i])); + } + } + op_->node_builder.Attr(attr_name, shapes); + return Status::OK(); } Status SetAttrFunctionList( const char* attr_name, diff --git a/tensorflow/c/eager/c_api_unified_experimental_test.cc b/tensorflow/c/eager/c_api_unified_experimental_test.cc index a25dccc4638..c56e8ab05fc 100644 --- a/tensorflow/c/eager/c_api_unified_experimental_test.cc +++ b/tensorflow/c/eager/c_api_unified_experimental_test.cc @@ -92,9 +92,255 @@ TEST_P(UnifiedCAPI, TestBasicEager) { TF_DeleteExecutionContext(ctx); } +// MatMul Test +TEST_P(UnifiedCAPI, TestBasicEagerMatMul) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TF_ExecutionContext* ctx = TF_NewEagerExecutionContext(opts, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TFE_DeleteContextOptions(opts); + + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + /* Want to test simple MatMul example: + [[0,0], * [[0,0], = [[0,0], + [0,0]] [0,0]] [0,0]] + */ + + // Build an abstract input tensor. 
+ int64_t dims[] = {2, 2}; // Matrices will be 2 x 2 + int num_dims = sizeof(dims) / sizeof(dims[0]); + + float vals[] = {0.0f, 0.0f, 0.0f, 0.0f}; + TFE_Context* eager_ctx = TF_ExecutionContextGetTFEContext(ctx, status.get()); + TFE_TensorHandle* t = + TestMatrixTensorHandleWithInput(eager_ctx, vals, dims, num_dims); + + TF_AbstractTensor* at = TF_CreateAbstractTensorFromEagerTensor( + t, status.get()); // get abstract tensor + + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + // Build an abstract operation. + auto* op = TF_NewAbstractOp(ctx); + TF_AbstractOpSetOpType(op, "MatMul", status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + // Build inputs and outputs. + TF_AbstractTensor* inputs[2] = {at, at}; + TF_OutputList* o = TF_NewOutputList(); + TF_OutputListSetNumOutputs(o, 1, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + // Execute. + TF_ExecuteOperation(op, 2, inputs, o, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + // Clean up operation and inputs. + TF_DeleteAbstractOp(op); + TF_DeleteAbstractTensor(at); + + // Verify the results. + ASSERT_EQ(1, TF_OutputListNumOutputs(o)); + TF_AbstractTensor* result = TF_OutputListGet(o, 0); + TFE_TensorHandle* result_t = + TF_AbstractTensorGetEagerTensor(result, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TF_Tensor* result_tensor = TFE_TensorHandleResolve(result_t, status.get()); + + // Copy Tensor data into an array. + float result_data[4] = {0}; + memcpy(&result_data[0], TF_TensorData(result_tensor), + TF_TensorByteSize(result_tensor)); + + int data_len = 4; // length of result_data + for (int i = 0; i < data_len; i++) { + EXPECT_EQ(result_data[i], 0); + } + + TF_DeleteTensor(result_tensor); + TF_DeleteAbstractTensor(result); + TF_DeleteOutputList(o); + TF_DeleteExecutionContext(ctx); +} + +// MatMul Test 2 +TEST_P(UnifiedCAPI, TestBasicEagerMatMul2) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TF_ExecutionContext* ctx = TF_NewEagerExecutionContext(opts, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TFE_DeleteContextOptions(opts); + + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + /* Want to test simple MatMul example with abstract tensors: + [[1,2], * [[5,6], = [[19,22], + [3,4]] [7,8]] [43,50]] + */ + + // Build 1st Matrix. + int64_t dims[] = {2, 2}; // Matrices will be 2 x 2 + int num_dims = sizeof(dims) / sizeof(dims[0]); + + float vals1[] = {1.0f, 2.0f, 3.0f, 4.0f}; + TFE_Context* eager_ctx = TF_ExecutionContextGetTFEContext(ctx, status.get()); + TFE_TensorHandle* t1 = + TestMatrixTensorHandleWithInput(eager_ctx, vals1, dims, num_dims); + + TF_AbstractTensor* at1 = TF_CreateAbstractTensorFromEagerTensor( + t1, status.get()); // get abstract tensor + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + // Build 2nd Matrix. + float vals2[] = {5.0f, 6.0f, 7.0f, 8.0f}; + TFE_TensorHandle* t2 = + TestMatrixTensorHandleWithInput(eager_ctx, vals2, dims, num_dims); + + TF_AbstractTensor* at2 = TF_CreateAbstractTensorFromEagerTensor( + t2, status.get()); // get abstract tensor + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + // Build an abstract operation. 
+ auto* op = TF_NewAbstractOp(ctx); + TF_AbstractOpSetOpType(op, "MatMul", status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + // Build inputs and outputs. + TF_AbstractTensor* inputs[2] = {at1, at2}; + TF_OutputList* o = TF_NewOutputList(); + TF_OutputListSetNumOutputs(o, 1, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + // Execute. + TF_ExecuteOperation(op, 2, inputs, o, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + // Clean up operation and inputs. + TF_DeleteAbstractOp(op); + TF_DeleteAbstractTensor(at1); + TF_DeleteAbstractTensor(at2); + + // Verify the results. + ASSERT_EQ(1, TF_OutputListNumOutputs(o)); + TF_AbstractTensor* result = TF_OutputListGet(o, 0); + TFE_TensorHandle* result_t = + TF_AbstractTensorGetEagerTensor(result, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + TF_Tensor* result_tensor = TFE_TensorHandleResolve(result_t, status.get()); + + // Copy Tensor data into array. + float result_data[4] = {0}; + memcpy(&result_data[0], TF_TensorData(result_tensor), + TF_TensorByteSize(result_tensor)); + + // Build expected result & verify. + float e_vals[] = {19.0f, 22.0f, 43.0f, 50.0f}; + + int data_len = 4; // length of e_vals + for (int i = 0; i < data_len; i++) { + EXPECT_EQ(result_data[i], e_vals[i]); + } + + TF_DeleteTensor(result_tensor); + TF_DeleteAbstractTensor(result); + TF_DeleteOutputList(o); + TF_DeleteExecutionContext(ctx); +} + +// MatAdd +TEST_P(UnifiedCAPI, TestBasicEagerMatAdd) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TF_ExecutionContext* ctx = TF_NewEagerExecutionContext(opts, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TFE_DeleteContextOptions(opts); + + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + /* Want to test simple MatAdd example with abstract tensors: + [[1,2] , + [[5,6], = [[6,8], + [3,4] ] [7,8] ] [10,12]] + */ + + // Build 1st Matrix. + int64_t dims[] = {2, 2}; // Matrices will be 2 x 2 + int num_dims = sizeof(dims) / sizeof(dims[0]); + + float vals1[] = {1.0f, 2.0f, 3.0f, 4.0f}; + TFE_Context* eager_ctx = TF_ExecutionContextGetTFEContext(ctx, status.get()); + TFE_TensorHandle* t1 = + TestMatrixTensorHandleWithInput(eager_ctx, vals1, dims, num_dims); + + TF_AbstractTensor* at1 = TF_CreateAbstractTensorFromEagerTensor( + t1, status.get()); // get abstract tensor + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + // Build 2nd Matrix. + float vals2[] = {5.0f, 6.0f, 7.0f, 8.0f}; + TFE_TensorHandle* t2 = + TestMatrixTensorHandleWithInput(eager_ctx, vals2, dims, num_dims); + + TF_AbstractTensor* at2 = TF_CreateAbstractTensorFromEagerTensor( + t2, status.get()); // get abstract tensor + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + // Build an abstract operation. + auto* op = TF_NewAbstractOp(ctx); + TF_AbstractOpSetOpType(op, "Add", status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + // Build inputs and outputs. + TF_AbstractTensor* inputs[2] = {at1, at2}; + TF_OutputList* o = TF_NewOutputList(); + TF_OutputListSetNumOutputs(o, 1, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + // Execute. 
+ TF_ExecuteOperation(op, 2, inputs, o, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + // Clean up operation and inputs. + TF_DeleteAbstractOp(op); + TF_DeleteAbstractTensor(at1); + TF_DeleteAbstractTensor(at2); + + // Verify the results. + ASSERT_EQ(1, TF_OutputListNumOutputs(o)); + TF_AbstractTensor* result = TF_OutputListGet(o, 0); + TFE_TensorHandle* result_t = + TF_AbstractTensorGetEagerTensor(result, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + TF_Tensor* result_tensor = TFE_TensorHandleResolve(result_t, status.get()); + + // Copy Tensor data into array. + float result_data[4] = {0}; + memcpy(&result_data[0], TF_TensorData(result_tensor), + TF_TensorByteSize(result_tensor)); + + // Build expected result & verify. + float e_vals[] = {6.0f, 8.0f, 10.0f, 12.0f}; + + int data_len = 4; // length of e_vals + for (int i = 0; i < data_len; i++) { + EXPECT_EQ(result_data[i], e_vals[i]); + } + + TF_DeleteTensor(result_tensor); + TF_DeleteAbstractTensor(result); + TF_DeleteOutputList(o); + TF_DeleteExecutionContext(ctx); +} + TEST_P(UnifiedCAPI, TestBasicGraph) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); + // Start a new function / execution context. string fn_name = "double"; TF_ExecutionContext* graph_ctx = @@ -142,6 +388,7 @@ TEST_P(UnifiedCAPI, TestBasicGraph) { TF_ExecutionContextRegisterFunction(eager_execution_ctx, func, status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + // Build the abstract op to run the function. TF_AbstractOp* fn_op = TF_NewAbstractOp(eager_execution_ctx); TF_AbstractOpSetOpType(fn_op, fn_name.c_str(), status.get()); @@ -180,6 +427,111 @@ TEST_P(UnifiedCAPI, TestBasicGraph) { TF_DeleteExecutionContext(eager_execution_ctx); } +// Graph Tracing for MatMul +TEST_P(UnifiedCAPI, TestBasicGraphMatMul) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + + // Start a new function / execution context. + string fn_name = "matrix_multiply"; + TF_ExecutionContext* graph_ctx = + TF_CreateFunction(fn_name.c_str(), status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + auto* placeholder_t = + TF_AddFunctionParameter(graph_ctx, TF_FLOAT, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + // Build an abstract operation. + auto* matmul_op = TF_NewAbstractOp(graph_ctx); + TF_AbstractOpSetOpType(matmul_op, "MatMul", status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TF_AbstractOpSetOpName(matmul_op, "my_matmul", status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + // Build inputs and outputs. + TF_AbstractTensor* inputs[2] = {placeholder_t, placeholder_t}; + TF_OutputList* mm_outputs = TF_NewOutputList(); + TF_OutputListSetNumOutputs(mm_outputs, 1, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + // Execute. + TF_ExecuteOperation(matmul_op, 2, inputs, mm_outputs, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + // Clean up operation and inputs. 
+ TF_DeleteAbstractOp(matmul_op); + + TF_AbstractFunction* func = + TF_FinalizeFunction(graph_ctx, mm_outputs, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + /* Now that the graph is built, test graph implementation on matmul example: + [[1,1] , * [[1,1] , = [[2,2], + [1,1]] [1,1]] [2,2]] + */ + + // Build eager context. + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TF_ExecutionContext* eager_execution_ctx = + TF_NewEagerExecutionContext(opts, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TFE_DeleteContextOptions(opts); + + TF_ExecutionContextRegisterFunction(eager_execution_ctx, func, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + // Build the abstract op to run the function. + TF_AbstractOp* fn_op = TF_NewAbstractOp(eager_execution_ctx); + TF_AbstractOpSetOpType(fn_op, fn_name.c_str(), status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + // Build an abstract input tensor. + TFE_Context* eager_ctx = + TF_ExecutionContextGetTFEContext(eager_execution_ctx, status.get()); + + float vals[] = {1.0f, 1.0f, 1.0f, 1.0f}; + int64_t dims[] = {2, 2}; // Matrices will be 2 x 2 + int num_dims = sizeof(dims) / sizeof(dims[0]); + + TFE_TensorHandle* input_eager = + TestMatrixTensorHandleWithInput(eager_ctx, vals, dims, num_dims); + TF_AbstractTensor* input_t = + TF_CreateAbstractTensorFromEagerTensor(input_eager, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + TF_OutputListSetNumOutputs(mm_outputs, 1, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TF_ExecuteOperation(fn_op, 1, &input_t, mm_outputs, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + ASSERT_EQ(1, TF_OutputListNumOutputs(mm_outputs)); + TF_AbstractTensor* final_result = TF_OutputListGet(mm_outputs, 0); + TFE_TensorHandle* final = + TF_AbstractTensorGetEagerTensor(final_result, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TF_Tensor* f_t = TFE_TensorHandleResolve(final, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + float result_data[4] = {0}; + memcpy(&result_data[0], TF_TensorData(f_t), TF_TensorByteSize(f_t)); + + int data_len = 4; + for (int i = 0; i < data_len; i++) { + ASSERT_EQ(result_data[i], 2.0f); + } + + TF_DeleteAbstractTensor(final_result); + TF_DeleteOutputList(mm_outputs); + TF_DeleteAbstractTensor(placeholder_t); + TF_DeleteAbstractOp(fn_op); + TF_DeleteAbstractTensor(input_t); + TF_DeleteTensor(f_t); + TF_DeleteAbstractFunction(func); + + TF_DeleteExecutionContext(eager_execution_ctx); +} + TEST_P(UnifiedCAPI, TestMultiOutputGraph) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); @@ -336,6 +688,217 @@ TEST_P(UnifiedCAPI, TestMultiOutputGraph) { TF_DeleteAbstractFunction(func); } +TEST_P(UnifiedCAPI, TestMultiOutputGraphMatMul) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TF_Status* s = status.get(); + + // Start a new function / execution context. 
+ string fn_name = "two_adds_and_matmul"; + TF_ExecutionContext* graph_ctx = TF_CreateFunction(fn_name.c_str(), s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + + auto* arg0 = TF_AddFunctionParameter(graph_ctx, TF_FLOAT, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + auto* arg1 = TF_AddFunctionParameter(graph_ctx, TF_FLOAT, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + + // Create a first "Add" computing `arg0 + arg1`. + TF_AbstractTensor* add_output1; + { + // Build an abstract operation, inputs and output. + auto* add_op = TF_NewAbstractOp(graph_ctx); + TF_AbstractOpSetOpType(add_op, "Add", s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_AbstractOpSetOpName(add_op, "my_add1", s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_AbstractTensor* inputs[2] = {arg0, arg1}; + TF_OutputList* add_outputs = TF_NewOutputList(); + TF_OutputListSetNumOutputs(add_outputs, 1, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + // Trace the operation now (create a node in the graph). + TF_ExecuteOperation(add_op, 2, inputs, add_outputs, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_DeleteAbstractOp(add_op); + + // Extract the resulting tensor. + add_output1 = TF_OutputListGet(add_outputs, 0); + TF_DeleteOutputList(add_outputs); + } + + // Same with a second "Add" computing `arg1 + arg1`. + TF_AbstractTensor* add_output2; + { + // Build an abstract operation, inputs and output. + auto* add_op = TF_NewAbstractOp(graph_ctx); + TF_AbstractOpSetOpType(add_op, "Add", s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_AbstractOpSetOpName(add_op, "my_add2", s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_AbstractTensor* inputs[2] = {arg1, arg1}; + TF_OutputList* add_outputs = TF_NewOutputList(); + TF_OutputListSetNumOutputs(add_outputs, 1, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + // Trace the operation now (create a node in the graph). + TF_ExecuteOperation(add_op, 2, inputs, add_outputs, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_DeleteAbstractOp(add_op); + + // Extract the resulting tensor. + add_output2 = TF_OutputListGet(add_outputs, 0); + TF_DeleteOutputList(add_outputs); + } + + // 3rd Output will be Matrix Multiplication of add_output1 and add_output2 + TF_AbstractTensor* mm_output; + { + // Build an abstract operation, inputs and output. + auto* mm_op = TF_NewAbstractOp(graph_ctx); + TF_AbstractOpSetOpType(mm_op, "MatMul", s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_AbstractOpSetOpName(mm_op, "mm", s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_AbstractTensor* inputs[2] = {add_output1, add_output2}; + TF_OutputList* mm_outputs = TF_NewOutputList(); + TF_OutputListSetNumOutputs(mm_outputs, 1, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + // Trace the operation now (create a node in the graph). + TF_ExecuteOperation(mm_op, 2, inputs, mm_outputs, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_DeleteAbstractOp(mm_op); + + // Extract the resulting tensor. + mm_output = TF_OutputListGet(mm_outputs, 0); + TF_DeleteOutputList(mm_outputs); + } + + // Finalize the function by providing the returned values. + TF_AbstractFunction* func; + { + // We want to return the output of both add operations and MatMul operation, + // create a new list and populate it. 
+ TF_OutputList* func_outputs = TF_NewOutputList(); + TF_OutputListPushBack(func_outputs, add_output1, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_OutputListPushBack(func_outputs, add_output2, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_OutputListPushBack(func_outputs, mm_output, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + func = TF_FinalizeFunction(graph_ctx, func_outputs, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_DeleteOutputList(func_outputs); + } + + /** + * We traced so far this function: + * + * def two_adds_and_mm(A, B): + * my_add1 = A + B + * my_add2 = B + B + * mm = tf.MatMul(my_add1,my_add2) + * return my_add1, my_add2, mm + * + * Now we will execute this function with an eager context: + * + * A =[[0, 1],[1, 0]] + * B =[[1, 0],[0, 1]] + * + * output1, output2, output3 = two_adds_and_mm(A, B) + * + * We expect outputs: + * + * output1 = [[1, 1],[1, 1]] + * output2 = [[2, 0],[0, 2]] + * output3 = [[2, 2],[2, 2]] + * + */ + + // Build eager context. + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TF_ExecutionContext* eager_execution_ctx = + TF_NewEagerExecutionContext(opts, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TFE_DeleteContextOptions(opts); + + TF_ExecutionContextRegisterFunction(eager_execution_ctx, func, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + + // Build the abstract op to run the function. + TF_AbstractOp* fn_op = TF_NewAbstractOp(eager_execution_ctx); + TF_AbstractOpSetOpType(fn_op, fn_name.c_str(), s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + + // Build two abstract input tensors as function arguments. + std::vector func_args; + { + TFE_Context* eager_ctx = + TF_ExecutionContextGetTFEContext(eager_execution_ctx, s); + + // 1st Arg + float vals1[] = {0.0f, 1.0f, 1.0f, 0.0f}; + int64_t dims[] = {2, 2}; // Matrices will be 2 x 2 + int num_dims = sizeof(dims) / sizeof(dims[0]); + + TFE_TensorHandle* input_eager = + TestMatrixTensorHandleWithInput(eager_ctx, vals1, dims, num_dims); + func_args.push_back(TF_CreateAbstractTensorFromEagerTensor(input_eager, s)); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + + // 2nd Arg + float vals2[] = {1.0f, 0.0f, 0.0f, 1.0f}; + input_eager = + TestMatrixTensorHandleWithInput(eager_ctx, vals2, dims, num_dims); + func_args.push_back(TF_CreateAbstractTensorFromEagerTensor(input_eager, s)); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + } + + TF_OutputList* func_outputs = TF_NewOutputList(); + TF_OutputListSetNumOutputs(func_outputs, 3, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_ExecuteOperation(fn_op, func_args.size(), func_args.data(), func_outputs, + s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_DeleteAbstractOp(fn_op); + for (TF_AbstractTensor* t : func_args) TF_DeleteAbstractTensor(t); + + ASSERT_EQ(3, TF_OutputListNumOutputs(func_outputs)); + + float expected_outputs[3][4] = {{1.0f, 1.0f, 1.0f, 1.0f}, + {2.0f, 0.0f, 0.0f, 2.0f}, + {2.0f, 2.0f, 2.0f, 2.0f}}; + + float result_data[4]; + for (int idx = 0; idx < 3; ++idx) { + TF_AbstractTensor* result = TF_OutputListGet(func_outputs, idx); + TFE_TensorHandle* handle = TF_AbstractTensorGetEagerTensor(result, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_Tensor* f_t = TFE_TensorHandleResolve(handle, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + + memcpy(&result_data[0], TF_TensorData(f_t), TF_TensorByteSize(f_t)); + + // Verify results for each output + for (int j = 0; j < 4; j++) { + 
ASSERT_EQ(result_data[j], expected_outputs[idx][j]); + } + + TF_DeleteTensor(f_t); + } + + // Free memory associated with add and MatMul outputs + for (int idx = 0; idx < 3; ++idx) { + TF_AbstractTensor* result = TF_OutputListGet(func_outputs, idx); + TF_DeleteAbstractTensor(result); + } + + TF_DeleteOutputList(func_outputs); + TF_DeleteExecutionContext(eager_execution_ctx); + TF_DeleteAbstractFunction(func); +} + TEST_P(UnifiedCAPI, TF_ExecutionContextToFunctionWithEagerContextRaises) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); diff --git a/tensorflow/c/eager/gradients.cc b/tensorflow/c/eager/gradients.cc index 3a7a6282192..39cadd421e2 100644 --- a/tensorflow/c/eager/gradients.cc +++ b/tensorflow/c/eager/gradients.cc @@ -51,25 +51,14 @@ int64 ToId(AbstractTensorHandle* t) { TapeTensor::TapeTensor(AbstractTensorHandle* handle, AbstractContext* ctx) : handle_(handle), ctx_(ctx) { - // TODO(b/160888114): Make AbstractTensorHandle RefCounted. Right now we rely - // on the client to keep this tensor live for the duration of the gradient - // computation. - // handle_->Ref(); + handle_->Ref(); } TapeTensor::TapeTensor(const TapeTensor& other) { handle_ = other.handle_; - // TODO(b/160888114): Make AbstractTensorHandle RefCounted. Right now we rely - // on the client to keep this tensor live for the duration of the gradient - // computation. - // handle_->Ref(); + handle_->Ref(); ctx_ = other.ctx_; } -TapeTensor::~TapeTensor() { - // TODO(b/160888114): Make AbstractTensorHandle RefCounted. Right now we rely - // on the client to keep this tensor live for the duration of the gradient - // computation. - // handle_->Unref(); -} +TapeTensor::~TapeTensor() { handle_->Unref(); } tensorflow::int64 TapeTensor::GetID() const { return ToId(handle_); } @@ -112,7 +101,7 @@ AbstractTensorHandle* TapeTensor::ZerosLike() const { } if (isa(op.get())) { s = dyn_cast(op.get())->SetOpName( - absl::StrCat("OnesLike", ToId(handle_)).c_str()); + absl::StrCat("ZerosLike", ToId(handle_)).c_str()); if (!s.ok()) { return nullptr; } @@ -175,7 +164,8 @@ Status TapeVSpace::CallBackwardFunction( gtl::ArraySlice output_gradients, std::vector* result) const { if (backward_function == nullptr) return Status::OK(); - return backward_function->Compute(output_gradients, result); + Context ctx = {ctx_}; + return backward_function->Compute(&ctx, output_gradients, result); } // Looks up the ID of a Gradient. @@ -191,7 +181,7 @@ TapeTensor TapeVSpace::TapeTensorFromGradient(AbstractTensorHandle* g) const { void TapeVSpace::MarkAsResult(AbstractTensorHandle* gradient) const {} void TapeVSpace::DeleteGradient(AbstractTensorHandle* gradient) const { - gradient->Release(); + gradient->Unref(); } // Helper functions which delegate to `AbstractOperation`, update @@ -373,6 +363,10 @@ Status Execute(AbstractOperation* op_, AbstractContext* ctx, input_ids[i] = ToId(forward_op_->inputs[i]); input_dtypes[i] = forward_op_->inputs[i]->DataType(); } + for (int i = 0; i < *num_retvals; i++) { + // TODO(srbs): Manage refcount of ForwardOperation's inputs/outputs. 
+ forward_op_->outputs.push_back(retvals[i]); + } std::vector tape_tensors; for (auto t : retvals) { tape_tensors.push_back(TapeTensor(t, ctx)); diff --git a/tensorflow/c/eager/gradients.h b/tensorflow/c/eager/gradients.h index e09b6ff8613..267ee5b7ab2 100644 --- a/tensorflow/c/eager/gradients.h +++ b/tensorflow/c/eager/gradients.h @@ -31,7 +31,8 @@ namespace gradients { // // class AddGradientFunction : public GradientFunction { // public: -// Status Compute(absl::Span grad_inputs, +// Status Compute(Context* ctx, +// absl::Span grad_inputs, // std::vector* grad_outputs) override { // grad_outputs->resize(2); // (*grad_outputs)[0] = grad_inputs[0]; @@ -50,11 +51,16 @@ namespace gradients { // Status RegisterGradients(GradientRegistry* registry) { // return registry->Register("Add", AddRegisterer); // } +struct Context { + public: + AbstractContext* ctx; +}; class GradientFunction { public: // TODO(srbs): How we support CompositeTensors e.g. IndexedSlices in // `grad_inputs`. - virtual Status Compute(absl::Span grad_inputs, + virtual Status Compute(Context* ctx, + absl::Span grad_inputs, std::vector* grad_outputs) = 0; virtual ~GradientFunction() {} }; diff --git a/tensorflow/c/eager/gradients_test.cc b/tensorflow/c/eager/gradients_test.cc index 5820058f3e2..41993b3e125 100644 --- a/tensorflow/c/eager/gradients_test.cc +++ b/tensorflow/c/eager/gradients_test.cc @@ -23,6 +23,8 @@ limitations under the License. #include "tensorflow/c/eager/c_api_unified_experimental.h" #include "tensorflow/c/eager/c_api_unified_experimental_internal.h" #include "tensorflow/c/eager/gradients_internal.h" +#include "tensorflow/c/experimental/gradients/math_grad.h" +#include "tensorflow/c/experimental/ops/array_ops.h" #include "tensorflow/c/tf_status_helper.h" #include "tensorflow/c/tf_tensor.h" #include "tensorflow/core/lib/llvm_rtti/llvm_rtti.h" @@ -42,55 +44,12 @@ class CppGradients } }; -// Creates an Identity op. 
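// ---------------------------------------------------------------------------
// Illustrative sketch (editorial, not part of this change): with the new
// GradientFunction::Compute(Context*, ...) signature added to gradients.h
// above, a gradient body can build ops from the AbstractContext carried in
// `Context` instead of capturing a context at registration time, as the
// removed in-test AddGradientFunction below used to. The class here is purely
// hypothetical (a "Neg" gradient); the real Add/Exp gradients registered in
// this test now come from tensorflow/c/experimental/gradients/math_grad.h.
// The span/vector template arguments, elided by the diff rendering, are
// assumed to be AbstractTensorHandle*.
class NegGradientFunction : public GradientFunction {
 public:
  Status Compute(Context* ctx,
                 absl::Span<AbstractTensorHandle*> grad_inputs,
                 std::vector<AbstractTensorHandle*>* grad_outputs) override {
    // d(-x)/dx = -1, so negate the incoming gradient with an op created from
    // the tape-provided context. Op-name handling for tracing contexts is
    // omitted for brevity.
    AbstractOperationPtr neg_op(ctx->ctx->CreateOperation());
    TF_RETURN_IF_ERROR(neg_op->Reset("Neg", /*raw_device_name=*/nullptr));
    TF_RETURN_IF_ERROR(neg_op->AddInput(grad_inputs[0]));
    grad_outputs->resize(1);
    int num_retvals = 1;
    return neg_op->Execute(absl::MakeSpan(*grad_outputs), &num_retvals);
  }
  ~NegGradientFunction() override {}
};

GradientFunction* NegRegisterer(const ForwardOperation& op) {
  return new NegGradientFunction;
}
// ---------------------------------------------------------------------------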
-Status Identity(AbstractContext* ctx, - absl::Span inputs, - absl::Span outputs, const char* name) { - AbstractOperationPtr identity_op(ctx->CreateOperation()); - TF_RETURN_IF_ERROR( - identity_op->Reset("Identity", /*raw_device_name=*/nullptr)); - if (isa(identity_op.get())) { - TF_RETURN_IF_ERROR(dyn_cast(identity_op.get()) - ->SetOpName(name)); - } - TF_RETURN_IF_ERROR(identity_op->AddInput(inputs[0])); - int num_retvals = 1; - TF_RETURN_IF_ERROR(identity_op->Execute(outputs, &num_retvals)); +Status RegisterGradients(GradientRegistry* registry) { + TF_RETURN_IF_ERROR(registry->Register("Add", AddRegisterer)); + TF_RETURN_IF_ERROR(registry->Register("Exp", ExpRegisterer)); return Status::OK(); } -// =================== Register gradients for Add ============================ -class AddGradientFunction : public GradientFunction { - public: - explicit AddGradientFunction(AbstractContext* ctx) : ctx_(ctx) {} - Status Compute(absl::Span grad_inputs, - std::vector* grad_outputs) override { - grad_outputs->resize(2); - std::vector identity_outputs(1); - TF_RETURN_IF_ERROR(Identity(ctx_, {grad_inputs[0]}, - absl::MakeSpan(identity_outputs), "Id0")); - (*grad_outputs)[0] = identity_outputs[0]; - TF_RETURN_IF_ERROR(Identity(ctx_, {grad_inputs[0]}, - absl::MakeSpan(identity_outputs), "Id1")); - (*grad_outputs)[1] = identity_outputs[0]; - return Status::OK(); - } - ~AddGradientFunction() override {} - - private: - AbstractContext* ctx_; -}; - -GradientFunction* AddRegisterer(const ForwardOperation& op) { - return new AddGradientFunction(op.ctx); -} - -Status RegisterGradients(GradientRegistry* registry) { - return registry->Register("Add", AddRegisterer); -} - -// =================== End gradient registrations ============================ - // Computes `inputs[0] + inputs[1]` and records it on the tape. Status Add(AbstractContext* ctx, Tape* tape, absl::Span inputs, @@ -112,6 +71,26 @@ Status Add(AbstractContext* ctx, Tape* tape, registry); } +// Computes `exp(inputs[0])` and records it on the tape. +Status Exp(AbstractContext* ctx, Tape* tape, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + AbstractOperationPtr exp_op(ctx->CreateOperation()); + ForwardOperation forward_op; + forward_op.ctx = ctx; + TF_RETURN_IF_ERROR( + Reset(exp_op.get(), "Exp", /*raw_device_name=*/nullptr, &forward_op)); + if (isa(exp_op.get())) { + TF_RETURN_IF_ERROR( + dyn_cast(exp_op.get())->SetOpName("my_exp")); + } + TF_RETURN_IF_ERROR(AddInput(exp_op.get(), inputs[0], &forward_op)); + int num_retvals = 1; + return Execute(exp_op.get(), ctx, outputs, &num_retvals, &forward_op, tape, + registry); +} + // Computes // y = inputs[0] + inputs[1] // return grad(y, {inputs[0], inputs[1]}) @@ -136,7 +115,7 @@ Status AddGradModel(AbstractContext* ctx, source_tensors_that_are_targets, /*output_gradients=*/{}, &out_grads)); for (auto add_output : add_outputs) { - add_output->Release(); + add_output->Unref(); } outputs[0] = out_grads[0]; outputs[1] = out_grads[1]; @@ -144,6 +123,35 @@ Status AddGradModel(AbstractContext* ctx, return Status::OK(); } +// Computes +// y = exp(inputs[0]) +// return grad(y, {inputs[0]}) +Status ExpGradModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + TapeVSpace vspace(ctx); + auto tape = new Tape(/*persistent=*/false); + tape->Watch(ToId(inputs[0])); // Watch x. + std::vector exp_outputs(1); + TF_RETURN_IF_ERROR(Exp(ctx, tape, inputs, absl::MakeSpan(exp_outputs), + registry)); // Compute x+y. 
+ std::unordered_map + source_tensors_that_are_targets; + + std::vector out_grads; + TF_RETURN_IF_ERROR(tape->ComputeGradient( + vspace, /*target_tensor_ids=*/{ToId(exp_outputs[0])}, + /*source_tensor_ids=*/{ToId(inputs[0])}, source_tensors_that_are_targets, + /*output_gradients=*/{}, &out_grads)); + for (auto exp_output : exp_outputs) { + exp_output->Unref(); + } + outputs[0] = out_grads[0]; + delete tape; + return Status::OK(); +} + AbstractContext* BuildFunction(const char* fn_name) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); @@ -187,14 +195,15 @@ Status RunModel(Model model, AbstractContext* ctx, TF_RETURN_IF_ERROR(model(func_ctx.get(), absl::MakeSpan(func_inputs), absl::MakeSpan(output_list.outputs), registry)); for (auto func_input : func_inputs) { - func_input->Release(); + func_input->Unref(); } AbstractFunction* func = nullptr; TF_RETURN_IF_ERROR(dyn_cast(func_ctx.get()) ->Finalize(&output_list, &func)); scoped_func.reset(func); - output_list.outputs[0]->Release(); - output_list.outputs[1]->Release(); + for (auto output : output_list.outputs) { + output->Unref(); + } TF_RETURN_IF_ERROR(ctx->RegisterFunction(func)); } @@ -295,7 +304,7 @@ TEST_P(CppGradients, TestAddGrad) { ASSERT_EQ(errors::OK, s.code()) << s.error_message(); auto result_value = static_cast(TF_TensorData(result_tensor)); EXPECT_EQ(*result_value, 1.0); - outputs[0]->Release(); + outputs[0]->Unref(); TF_DeleteTensor(result_tensor); result_tensor = nullptr; @@ -303,17 +312,61 @@ TEST_P(CppGradients, TestAddGrad) { ASSERT_EQ(errors::OK, s.code()) << s.error_message(); result_value = static_cast(TF_TensorData(result_tensor)); EXPECT_EQ(*result_value, 1.0); - outputs[1]->Release(); + outputs[1]->Unref(); TF_DeleteTensor(result_tensor); } +TEST_P(CppGradients, TestExpGrad) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + AbstractContextPtr ctx; + { + AbstractContext* ctx_raw = nullptr; + Status s = + BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ctx.reset(ctx_raw); + } + + AbstractTensorHandlePtr x; + { + AbstractTensorHandle* x_raw = nullptr; + Status s = TestScalarTensorHandle(ctx.get(), 1.0f, &x_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + x.reset(x_raw); + } + + GradientRegistry registry; + Status s = RegisterGradients(®istry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + // Pseudo-code: + // + // tape.watch(x) + // y = exp(x) + // outputs = tape.gradient(y, x) + std::vector outputs(1); + s = RunModel(ExpGradModel, ctx.get(), {x.get()}, absl::MakeSpan(outputs), + /*use_function=*/!std::get<2>(GetParam()), registry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + TF_Tensor* result_tensor; + s = getValue(outputs[0], &result_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + auto result_value = static_cast(TF_TensorData(result_tensor)); + EXPECT_NEAR(*result_value, 2.718, 0.001); + outputs[0]->Unref(); + TF_DeleteTensor(result_tensor); + result_tensor = nullptr; +} + // TODO(b/160888630): Enable this test with mlir after AddInputList is // supported. It is needed for AddN op which is used for gradient aggregation. 
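// ---------------------------------------------------------------------------
// Editorial note (not part of this change): TestExpGrad above expects roughly
// 2.718 because the model computes y = exp(x) and asks the tape for dy/dx.
// Since d(exp(x))/dx = exp(x), the gradient at x = 1.0f is e, approximately
// 2.71828, which the test accepts via EXPECT_NEAR(*result_value, 2.718, 0.001).
// A hypothetical equivalent check would be
//   EXPECT_NEAR(*result_value, std::exp(1.0f), 0.001);
// ---------------------------------------------------------------------------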
#ifdef PLATFORM_GOOGLE INSTANTIATE_TEST_SUITE_P( UnifiedCAPI, CppGradients, ::testing::Combine(::testing::Values("graphdef"), - /*tfrt*/ ::testing::Values(false), + /*tfrt*/ ::testing::Values(true, false), /*executing_eagerly*/ ::testing::Values(true, false))); #else INSTANTIATE_TEST_SUITE_P( diff --git a/tensorflow/c/eager/immediate_execution_operation.h b/tensorflow/c/eager/immediate_execution_operation.h index 31a75c5b8c7..ee212b21a96 100644 --- a/tensorflow/c/eager/immediate_execution_operation.h +++ b/tensorflow/c/eager/immediate_execution_operation.h @@ -17,6 +17,7 @@ limitations under the License. #include +#include "absl/types/optional.h" #include "absl/types/span.h" #include "tensorflow/c/eager/abstract_operation.h" #include "tensorflow/c/eager/immediate_execution_tensor_handle.h" @@ -26,6 +27,7 @@ limitations under the License. #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/platform/casts.h" #include "tensorflow/core/platform/status.h" +#include "tensorflow/core/util/abstract_stack_trace.h" struct TFE_Op; @@ -36,6 +38,10 @@ class ImmediateExecutionOperation : public AbstractOperation { public: virtual void Clear() = 0; + // Returns the inputs of this op. + virtual absl::Span GetInputs() + const = 0; + virtual const tensorflow::OpDef* OpDef() const = 0; virtual Status InputLength(const char* input_name, int* length) = 0; @@ -44,6 +50,12 @@ class ImmediateExecutionOperation : public AbstractOperation { // Experimental virtual Status SetUseXla(bool enable) = 0; + // Set stack trace to be used for potential async error reporting. + virtual void SetStackTrace(AbstractStackTrace stack_trace) = 0; + + // Returns the stack trace set by `SetStackTrace` if exists. + virtual absl::optional GetStackTrace() = 0; + // For LLVM style RTTI. static bool classof(const AbstractOperation* ptr) { return ptr->getKind() == kEager || ptr->getKind() == kTfrt; diff --git a/tensorflow/c/eager/immediate_execution_tensor_handle.h b/tensorflow/c/eager/immediate_execution_tensor_handle.h index f7c77aa06db..6d32d482747 100644 --- a/tensorflow/c/eager/immediate_execution_tensor_handle.h +++ b/tensorflow/c/eager/immediate_execution_tensor_handle.h @@ -50,6 +50,14 @@ class ImmediateExecutionTensorHandle : public AbstractTensorHandle { // Return a copy of the handle. virtual ImmediateExecutionTensorHandle* Copy() = 0; + // Release any underlying resources, including the interface object. + // + // WARNING: The destructor of this class is marked as protected to disallow + // clients from directly destroying this object since it may manage it's own + // lifetime through ref counting. Thus this must be allocated on the heap and + // clients MUST call Release() in order to destroy an instance of this class. + virtual void Release() = 0; + // For LLVM style RTTI. 
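// ---------------------------------------------------------------------------
// Illustrative sketch (editorial, not part of this change): because the
// destructor of ImmediateExecutionTensorHandle is protected and the handle
// may manage its own lifetime through ref counting, clients destroy instances
// via Release() rather than `delete`, as documented above. One hypothetical
// way to make that automatic is a unique_ptr with a custom deleter:
//
//   struct Releaser {
//     void operator()(ImmediateExecutionTensorHandle* h) const {
//       if (h != nullptr) h->Release();
//     }
//   };
//   using ManagedHandle =
//       std::unique_ptr<ImmediateExecutionTensorHandle, Releaser>;
// ---------------------------------------------------------------------------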
static bool classof(const AbstractTensorHandle* ptr) { return ptr->getKind() == kEager || ptr->getKind() == kTfrt; diff --git a/tensorflow/c/eager/tape.h b/tensorflow/c/eager/tape.h index 40cfa87dd66..27629bb3bdf 100644 --- a/tensorflow/c/eager/tape.h +++ b/tensorflow/c/eager/tape.h @@ -177,12 +177,12 @@ class GradientTape { template class ForwardFunction : public std::function&, - std::vector*)> { + std::vector*, bool)> { public: template explicit ForwardFunction(lambda_type lambda) : std::function&, - std::vector*)>(lambda) {} + std::vector*, bool)>(lambda) {} }; // Computes Jacobian-vector products using forward-mode automatic @@ -205,8 +205,9 @@ class ForwardAccumulator { // Does not take ownership of `vspace`, which must outlive the // ForwardAccumulator. explicit ForwardAccumulator( - const VSpace& vspace) - : vspace_(vspace) { + const VSpace& vspace, + bool use_batch) + : vspace_(vspace), use_batch_(use_batch) { call_state_.emplace(nullptr, false); } @@ -314,6 +315,9 @@ class ForwardAccumulator { // available in language bindings (e.g. Python). const VSpace& vspace_; + // Decides if tangents are vectorized or not + bool use_batch_; + struct AccumulatorCallState { AccumulatorCallState( GradientTape* backward_tape, @@ -573,7 +577,7 @@ Status InitialGradients( gtl::ArraySlice output_gradients, const TensorTape& tensor_tape, const OpTape& op_tape, std::unordered_map>* result) { - for (int i = 0; i < target_tensor_ids.size(); ++i) { + for (int i = 0, end = target_tensor_ids.size(); i < end; ++i) { const int64 id = target_tensor_ids[i]; if (output_gradients.empty() || output_gradients[i] == nullptr) { auto tensor_it = tensor_tape.find(id); @@ -699,7 +703,7 @@ Status GradientTape::ComputeGradient( std::vector out_gradients; out_gradients.reserve(trace.output_tensor_info.size()); std::vector unneeded_gradients; - for (int i = 0; i < trace.input_tensor_id.size(); i++) { + for (int i = 0, end = trace.input_tensor_id.size(); i < end; i++) { const auto& in_tensor_id = trace.input_tensor_id[i]; if (tensor_tape_.find(in_tensor_id) == tensor_tape_.end() && sources_set.find(in_tensor_id) == sources_set.end()) { @@ -709,7 +713,7 @@ Status GradientTape::ComputeGradient( bool any_gradient_nonzero = false; std::vector zero_indices; - for (int i = 0; i < trace.output_tensor_info.size(); ++i) { + for (int i = 0, end = trace.output_tensor_info.size(); i < end; ++i) { const int64 id = trace.output_tensor_info[i].GetID(); auto grad_it = gradients.find(id); if (grad_it == gradients.end()) { @@ -775,7 +779,7 @@ Status GradientTape::ComputeGradient( } VLOG(1) << "Got " << in_gradients.size() << " in_gradients for " << trace.input_tensor_id.size() << " sources"; - for (int i = 0; i < in_gradients.size(); ++i) { + for (int i = 0, end = in_gradients.size(); i < end; ++i) { const int64 id = trace.input_tensor_id[i]; if (in_gradients[i] != nullptr) { auto& unaggregated_grads = gradients[id]; @@ -968,7 +972,7 @@ ForwardAccumulator::ForwardpropFromTape( targets.reserve(grad.size()); used_in_grads.reserve(grad.size()); std::unordered_map sources_that_are_targets; - for (int grad_index = 0; grad_index < grad.size(); ++grad_index) { + for (int grad_index = 0, end = grad.size(); grad_index < end; ++grad_index) { Gradient* grad_tensor = grad[grad_index]; if (grad_tensor != nullptr) { int64 tensor_id = vspace_.TensorId(grad_tensor); @@ -1062,7 +1066,8 @@ Status ForwardAccumulator::Accumulate( output_tensors, backward_function_getter, backward_function_deleter, in_grads, &forward_grads)); } else { - 
TF_RETURN_IF_ERROR((*forward_function)(in_grads, &forward_grads)); + TF_RETURN_IF_ERROR( + (*forward_function)(in_grads, &forward_grads, use_batch_)); } for (int i = 0; i < forward_grads.size(); ++i) { if (forward_grads[i] != nullptr) { diff --git a/tensorflow/c/env.cc b/tensorflow/c/env.cc index ce715c43acb..fbde13dea5a 100644 --- a/tensorflow/c/env.cc +++ b/tensorflow/c/env.cc @@ -186,3 +186,22 @@ void TF_JoinThread(TF_Thread* thread) { // ::tensorflow::Thread joins on destruction delete reinterpret_cast<::tensorflow::Thread*>(thread); } + +void* TF_LoadSharedLibrary(const char* library_filename, TF_Status* status) { + void* handle = nullptr; + TF_SetStatus(status, TF_OK, ""); + ::tensorflow::Set_TF_Status_from_Status( + status, ::tensorflow::Env::Default()->LoadDynamicLibrary(library_filename, + &handle)); + return handle; +} + +void* TF_GetSymbolFromLibrary(void* handle, const char* symbol_name, + TF_Status* status) { + void* symbol = nullptr; + TF_SetStatus(status, TF_OK, ""); + ::tensorflow::Set_TF_Status_from_Status( + status, ::tensorflow::Env::Default()->GetSymbolFromLibrary( + handle, symbol_name, &symbol)); + return symbol; +} diff --git a/tensorflow/c/env.h b/tensorflow/c/env.h index 7dc7ac32f08..63e2c86ad44 100644 --- a/tensorflow/c/env.h +++ b/tensorflow/c/env.h @@ -184,6 +184,26 @@ TF_CAPI_EXPORT extern TF_Thread* TF_StartThread(const TF_ThreadOptions* options, // Waits for the given thread to finish execution, then deletes it. TF_CAPI_EXPORT extern void TF_JoinThread(TF_Thread* thread); +// \brief Load a dynamic library. +// +// Pass "library_filename" to a platform-specific mechanism for dynamically +// loading a library. The rules for determining the exact location of the +// library are platform-specific and are not documented here. +// +// On success, place OK in status and return the newly created library handle. +// Otherwise returns nullptr and set error status. +TF_CAPI_EXPORT extern void* TF_LoadSharedLibrary(const char* library_filename, + TF_Status* status); + +// \brief Get a pointer to a symbol from a dynamic library. +// +// "handle" should be a pointer returned from a previous call to +// TF_LoadLibraryFromEnv. On success, place OK in status and return a pointer to +// the located symbol. Otherwise returns nullptr and set error status. +TF_CAPI_EXPORT extern void* TF_GetSymbolFromLibrary(void* handle, + const char* symbol_name, + TF_Status* status); + #ifdef __cplusplus } #endif diff --git a/tensorflow/c/experimental/BUILD b/tensorflow/c/experimental/BUILD deleted file mode 100644 index 53cd99f18a6..00000000000 --- a/tensorflow/c/experimental/BUILD +++ /dev/null @@ -1,124 +0,0 @@ -# Description: -# Experimental C APIs for TensorFlow. 
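// ---------------------------------------------------------------------------
// Illustrative sketch (editorial, not part of this change): typical use of the
// TF_LoadSharedLibrary / TF_GetSymbolFromLibrary pair added to
// tensorflow/c/env.h above. The library name is illustrative; TF_InitPlugin is
// the entry point that modular filesystem plugins are expected to export.
TF_Status* status = TF_NewStatus();
void* lib = TF_LoadSharedLibrary("libexample_plugin.so", status);
if (TF_GetCode(status) == TF_OK) {
  void* init_fn = TF_GetSymbolFromLibrary(lib, "TF_InitPlugin", status);
  if (TF_GetCode(status) == TF_OK) {
    // Cast init_fn to the plugin's expected signature and invoke it here.
  }
}
TF_DeleteStatus(status);
// ---------------------------------------------------------------------------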
- -load( - "//tensorflow:tensorflow.bzl", - "tf_copts", - "tf_cuda_library", -) -load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test") - -package( - licenses = ["notice"], # Apache 2.0 -) - -tf_cuda_library( - name = "rendezvous_internal", - srcs = [ - "rendezvous.cc", - ], - hdrs = [ - "rendezvous.h", - "rendezvous_internal.h", - ], - copts = tf_copts(), - visibility = ["//tensorflow/c:__subpackages__"], - deps = [ - "//tensorflow/c:c_api_internal", - "//tensorflow/core:framework", - "//tensorflow/core:framework_internal", - "//tensorflow/core:lib", - "//tensorflow/core:lib_internal", - "//tensorflow/core/distributed_runtime:base_rendezvous_mgr", - "//tensorflow/core/distributed_runtime:worker_env", - "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib", - ], -) - -tf_cuda_library( - name = "rendezvous", - hdrs = [ - "rendezvous.h", - ], - copts = tf_copts(), - visibility = ["//visibility:public"], - deps = [ - ":rendezvous_internal", - "//tensorflow/c:c_api", - ], -) - -tf_cuda_library( - name = "network_internal", - srcs = [ - "network.cc", - ], - hdrs = [ - "network.h", - "network_internal.h", - ], - copts = tf_copts(), - visibility = ["//tensorflow/c:__subpackages__"], - deps = [ - ":rendezvous_internal", - "//tensorflow/c:c_api_internal", - "//tensorflow/core:framework_internal", - "//tensorflow/core:lib", - "//tensorflow/core:lib_internal", - "//tensorflow/core:protos_all_cc", - "//tensorflow/core/distributed_runtime:server_lib", - "//tensorflow/core/distributed_runtime:worker_env", - "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib", - ], -) - -tf_cuda_library( - name = "network", - hdrs = [ - "network.h", - ], - copts = tf_copts(), - visibility = ["//visibility:public"], - deps = [ - ":network_internal", - ":rendezvous", - "//tensorflow/c:c_api", - ], -) - -# ----------------------------------------------------------------------------- -# Tests - -tf_cuda_cc_test( - name = "network_test", - size = "medium", - srcs = ["network_test.cc"], - tags = ["noasan"], - # We must ensure that the dependencies can be dynamically linked since - # the shared library must be able to use core:framework. 
- # linkstatic = tf_kernel_tests_linkstatic(), - deps = [ - ":network", - ":network_internal", - ":rendezvous", - ":rendezvous_internal", - "//tensorflow/c:c_api", - "//tensorflow/c:env", - "//tensorflow/core:framework", - "//tensorflow/core:framework_internal", - "//tensorflow/core:lib", - "//tensorflow/core:lib_internal", - "//tensorflow/core:protos_all_cc", - "//tensorflow/core:test", - "//tensorflow/core:test_main", - "//tensorflow/core/distributed_runtime:rendezvous_mgr_interface", - "//tensorflow/core/distributed_runtime:server_lib", - "//tensorflow/core/distributed_runtime:session_mgr", - "//tensorflow/core/distributed_runtime:worker_env", - "//tensorflow/core/distributed_runtime:worker_session", - "//tensorflow/core/distributed_runtime/rpc:async_service_interface", - "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib", - "@com_google_absl//absl/synchronization", - "@com_google_absl//absl/time", - ], -) diff --git a/tensorflow/c/experimental/filesystem/filesystem_interface.h b/tensorflow/c/experimental/filesystem/filesystem_interface.h index 5463eb35088..6e05c861439 100644 --- a/tensorflow/c/experimental/filesystem/filesystem_interface.h +++ b/tensorflow/c/experimental/filesystem/filesystem_interface.h @@ -78,6 +78,11 @@ typedef struct TF_Filesystem { void* plugin_filesystem; } TF_Filesystem; +typedef struct TF_TransactionToken { + void* token; + TF_Filesystem* owner; +} TF_TransactionToken; + /// SECTION 2. Function tables for functionality provided by plugins /// ---------------------------------------------------------------------------- /// @@ -679,6 +684,133 @@ typedef struct TF_FilesystemOps { /// /// DEFAULT IMPLEMENTATION: No op. void (*flush_caches)(const TF_Filesystem* filesystem); + + /// Starts a new transaction. + /// + /// An opaque transaction token is returned in `token`. Ownership of the token + /// is in filesystem. Token will be freed in `end_transaction` call and any + /// access to token after that is invalid. + /// + /// In case of error, plugins must set `status` to a value different than + /// `TF_OK`, free memory allocated for `token` and return -1. + /// + /// The allocation and freeing of memory must happen via the functions sent to + /// core TensorFlow upon registration (see the `TF_FilesystemPluginInfo` + /// structure in Section 4). + /// + /// Plugins: + /// * Must set `status` to `TF_OK` if transaction successfuly started. + /// * Must set `status` to `TF_FAILED_PRECONDITION` if multiple transactions + /// are not supported + /// * Might use any other error value for `status` to signal other errors. + int (*start_transaction)(const TF_Filesystem* filesystem, + TF_TransactionToken** token, TF_Status* status); + + /// Ends transaction and free the `token`. Any access to token after + /// that will be invalid. + /// + /// In case of error, plugins must set `status` to a value different than + /// `TF_OK`, free memory allocated for `token` and return -1. + /// + /// The allocation and freeing of memory must happen via the functions sent to + /// core TensorFlow upon registration (see the `TF_FilesystemPluginInfo` + /// structure in Section 4). + /// + /// Plugins: + /// * Must set `status` to `TF_OK` if transaction successfuly finalized. + /// * Must set `status` to `TF_NOT_FOUND` if token is invalid/not found + /// * Might use any other error value for `status` to signal other errors. 
+ int (*end_transaction)(const TF_Filesystem* filesystem, + TF_TransactionToken* token, TF_Status* status); + + /// Adds file/directory in the `path` to transaction in `token`. It is a valid + /// operation to add a path that doesn't exist yet to a transaction. + /// + /// In case of error, plugins must set `status` to a value different than + /// `TF_OK`, free memory allocated for `token` and return -1. + /// + /// The allocation and freeing of memory must happen via the functions sent to + /// core TensorFlow upon registration (see the `TF_FilesystemPluginInfo` + /// structure in Section 4). + /// + /// Plugins: + /// * Must set `status` to `TF_OK` if path added to transaction successful. + /// * Must set `status` to `TF_NOT_FOUND` if `token` is invalid. + /// * Must set `status` to `TF_FAILED_PRECONDITION` if file/directory is in + /// another transaction and multiple transactions are not supported + /// * Might use any other error value for `status` to signal other errors. + int (*add_to_transaction)(const TF_Filesystem* filesystem, const char* path, + TF_TransactionToken* token, TF_Status* status); + + /// Returns transaction token for file/directory in the `path`. Note that path + /// may not exist yet but still might be part of a transaction. + /// + /// Transaction token is returned in `token`. Ownership of the token is in + /// filesystem. Token will be freed in `end_transaction` call and any access + /// to token after that is invalid. + /// + /// In case of error, plugins must set `status` to a value different than + /// `TF_OK`, free memory allocated for `token` and return -1. + /// + /// The allocation and freeing of memory must happen via the functions sent to + /// core TensorFlow upon registration (see the `TF_FilesystemPluginInfo` + /// structure in Section 4). + /// + /// Plugins: + /// * Must set `status` to `TF_OK` if a transaction for path is found + /// * Must set `status` to `TF_NOT_FOUND` if `path` is not part of any + /// transaction + /// * Must set `status` to `TF_FAILED_PRECONDITION` if `path` is + /// not in this filesystem. + /// * Might use any other error value for `status` to signal other errors. + int (*get_transaction_for_path)(const TF_Filesystem* filesystem, + const char* path, TF_TransactionToken** token, + TF_Status* status); + + /// Returns transaction token for `path` if it is part of a transaction else + /// starts a new transaction and adds `path` to that transaction + /// + /// Transaction token is returned in `token`. Ownership of the token is in + /// filesystem. Token will be freed in `end_transaction` call and any access + /// to token after that is invalid. + /// + /// In case of error, plugins must set `status` to a value different than + /// `TF_OK`, free memory allocated for `token` and return -1. + /// + /// The allocation and freeing of memory must happen via the functions sent to + /// core TensorFlow upon registration (see the `TF_FilesystemPluginInfo` + /// structure in Section 4). + /// + /// Plugins: + /// * Must set `status` to `TF_OK` if transaction found or successfuly + /// started. + /// * Must set `status` to `TF_NOT_FOUND` if `path` doesn't point to this + /// filesystem + /// * Must set `status` to `TF_FAILED_PRECONDITION` if file/directory is + /// not in any transaction and multiple transactions are not supported. + /// * Might use any other error value for `status` to signal other errors. 
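// ---------------------------------------------------------------------------
// Illustrative sketch (editorial, not part of this change): a minimal
// plugin-side implementation of the `start_transaction` / `end_transaction`
// hooks documented in this table. `plugin_memory_allocate` and
// `plugin_memory_free` stand in for the allocator callbacks a plugin receives
// at registration (see Section 4); all names here are illustrative.
//
//   static int my_start_transaction(const TF_Filesystem* filesystem,
//                                   TF_TransactionToken** token,
//                                   TF_Status* status) {
//     *token = (TF_TransactionToken*)plugin_memory_allocate(
//         sizeof(TF_TransactionToken));
//     (*token)->token = nullptr;  // No per-transaction state in this sketch.
//     (*token)->owner = (TF_Filesystem*)filesystem;
//     TF_SetStatus(status, TF_OK, "");
//     return 0;
//   }
//
//   static int my_end_transaction(const TF_Filesystem* filesystem,
//                                 TF_TransactionToken* token,
//                                 TF_Status* status) {
//     plugin_memory_free(token);  // The token is invalid after this call.
//     TF_SetStatus(status, TF_OK, "");
//     return 0;
//   }
// ---------------------------------------------------------------------------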
+ int (*get_or_start_transaction_for_path)(const TF_Filesystem* filesystem, + const char* path, + TF_TransactionToken** token, + TF_Status* status); + + /// Decodes transaction token in `token` to human readable format for + /// debugging. + /// + /// A new `char*` buffer must be allocated by this method. Core TensorFlow + /// manages the lifetime of the buffer after the call. Thus, all callers of + /// this method must take ownership of the returned pointer. + /// + /// Plugins must not return `nullptr`. Returning empty strings is allowed. + /// + /// The allocation and freeing of memory must happen via the functions sent to + /// core TensorFlow upon registration (see the `TF_FilesystemPluginInfo` + /// structure in Section 4). + /// + /// DEFAULT IMPLEMENTATION: Dump token and owner address. + char* (*decode_transaction_token)(const TF_Filesystem* filesystem, + const TF_TransactionToken* token); + } TF_FilesystemOps; // LINT.ThenChange(:filesystem_ops_version) diff --git a/tensorflow/c/experimental/filesystem/modular_filesystem.cc b/tensorflow/c/experimental/filesystem/modular_filesystem.cc index 58541ea2b36..9c8d3518800 100644 --- a/tensorflow/c/experimental/filesystem/modular_filesystem.cc +++ b/tensorflow/c/experimental/filesystem/modular_filesystem.cc @@ -35,7 +35,8 @@ using UniquePtrTo_TF_Status = ::std::unique_ptr; Status ModularFileSystem::NewRandomAccessFile( - const std::string& fname, std::unique_ptr* result) { + const std::string& fname, TransactionToken* token, + std::unique_ptr* result) { if (ops_->new_random_access_file == nullptr) return errors::Unimplemented(tensorflow::strings::StrCat( "Filesystem for ", fname, " does not support NewRandomAccessFile()")); @@ -54,7 +55,8 @@ Status ModularFileSystem::NewRandomAccessFile( } Status ModularFileSystem::NewWritableFile( - const std::string& fname, std::unique_ptr* result) { + const std::string& fname, TransactionToken* token, + std::unique_ptr* result) { if (ops_->new_writable_file == nullptr) return errors::Unimplemented(tensorflow::strings::StrCat( "Filesystem for ", fname, " does not support NewWritableFile()")); @@ -73,7 +75,8 @@ Status ModularFileSystem::NewWritableFile( } Status ModularFileSystem::NewAppendableFile( - const std::string& fname, std::unique_ptr* result) { + const std::string& fname, TransactionToken* token, + std::unique_ptr* result) { if (ops_->new_appendable_file == nullptr) return errors::Unimplemented(tensorflow::strings::StrCat( "Filesystem for ", fname, " does not support NewAppendableFile()")); @@ -92,7 +95,8 @@ Status ModularFileSystem::NewAppendableFile( } Status ModularFileSystem::NewReadOnlyMemoryRegionFromFile( - const std::string& fname, std::unique_ptr* result) { + const std::string& fname, TransactionToken* token, + std::unique_ptr* result) { if (ops_->new_read_only_memory_region_from_file == nullptr) return errors::Unimplemented(tensorflow::strings::StrCat( "Filesystem for ", fname, @@ -112,7 +116,8 @@ Status ModularFileSystem::NewReadOnlyMemoryRegionFromFile( return StatusFromTF_Status(plugin_status.get()); } -Status ModularFileSystem::FileExists(const std::string& fname) { +Status ModularFileSystem::FileExists(const std::string& fname, + TransactionToken* token) { if (ops_->path_exists == nullptr) return errors::Unimplemented(tensorflow::strings::StrCat( "Filesystem for ", fname, " does not support FileExists()")); @@ -125,6 +130,7 @@ Status ModularFileSystem::FileExists(const std::string& fname) { } bool ModularFileSystem::FilesExist(const std::vector& files, + TransactionToken* token, 
std::vector* status) { if (ops_->paths_exist == nullptr) return FileSystem::FilesExist(files, status); @@ -157,6 +163,7 @@ bool ModularFileSystem::FilesExist(const std::vector& files, } Status ModularFileSystem::GetChildren(const std::string& dir, + TransactionToken* token, std::vector* result) { if (ops_->get_children == nullptr) return errors::Unimplemented(tensorflow::strings::StrCat( @@ -182,6 +189,7 @@ Status ModularFileSystem::GetChildren(const std::string& dir, } Status ModularFileSystem::GetMatchingPaths(const std::string& pattern, + TransactionToken* token, std::vector* result) { if (ops_->get_matching_paths == nullptr) return internal::GetMatchingPaths(this, Env::Default(), pattern, result); @@ -203,7 +211,8 @@ Status ModularFileSystem::GetMatchingPaths(const std::string& pattern, return StatusFromTF_Status(plugin_status.get()); } -Status ModularFileSystem::DeleteFile(const std::string& fname) { +Status ModularFileSystem::DeleteFile(const std::string& fname, + TransactionToken* token) { if (ops_->delete_file == nullptr) return errors::Unimplemented(tensorflow::strings::StrCat( "Filesystem for ", fname, " does not support DeleteFile()")); @@ -216,6 +225,7 @@ Status ModularFileSystem::DeleteFile(const std::string& fname) { } Status ModularFileSystem::DeleteRecursively(const std::string& dirname, + TransactionToken* token, int64* undeleted_files, int64* undeleted_dirs) { if (undeleted_files == nullptr || undeleted_dirs == nullptr) @@ -238,7 +248,8 @@ Status ModularFileSystem::DeleteRecursively(const std::string& dirname, return StatusFromTF_Status(plugin_status.get()); } -Status ModularFileSystem::DeleteDir(const std::string& dirname) { +Status ModularFileSystem::DeleteDir(const std::string& dirname, + TransactionToken* token) { if (ops_->delete_dir == nullptr) return errors::Unimplemented(tensorflow::strings::StrCat( "Filesystem for ", dirname, " does not support DeleteDir()")); @@ -250,7 +261,8 @@ Status ModularFileSystem::DeleteDir(const std::string& dirname) { return StatusFromTF_Status(plugin_status.get()); } -Status ModularFileSystem::RecursivelyCreateDir(const std::string& dirname) { +Status ModularFileSystem::RecursivelyCreateDir(const std::string& dirname, + TransactionToken* token) { if (ops_->recursively_create_dir == nullptr) return FileSystem::RecursivelyCreateDir(dirname); @@ -261,7 +273,8 @@ Status ModularFileSystem::RecursivelyCreateDir(const std::string& dirname) { return StatusFromTF_Status(plugin_status.get()); } -Status ModularFileSystem::CreateDir(const std::string& dirname) { +Status ModularFileSystem::CreateDir(const std::string& dirname, + TransactionToken* token) { if (ops_->create_dir == nullptr) return errors::Unimplemented(tensorflow::strings::StrCat( "Filesystem for ", dirname, " does not support CreateDir()")); @@ -273,7 +286,8 @@ Status ModularFileSystem::CreateDir(const std::string& dirname) { return StatusFromTF_Status(plugin_status.get()); } -Status ModularFileSystem::Stat(const std::string& fname, FileStatistics* stat) { +Status ModularFileSystem::Stat(const std::string& fname, + TransactionToken* token, FileStatistics* stat) { if (ops_->stat == nullptr) return errors::Unimplemented(tensorflow::strings::StrCat( "Filesystem for ", fname, " does not support Stat()")); @@ -296,7 +310,8 @@ Status ModularFileSystem::Stat(const std::string& fname, FileStatistics* stat) { return StatusFromTF_Status(plugin_status.get()); } -Status ModularFileSystem::IsDirectory(const std::string& name) { +Status ModularFileSystem::IsDirectory(const std::string& name, + 
TransactionToken* token) { if (ops_->is_directory == nullptr) return FileSystem::IsDirectory(name); UniquePtrTo_TF_Status plugin_status(TF_NewStatus(), TF_DeleteStatus); @@ -307,6 +322,7 @@ Status ModularFileSystem::IsDirectory(const std::string& name) { } Status ModularFileSystem::GetFileSize(const std::string& fname, + TransactionToken* token, uint64* file_size) { if (ops_->get_file_size == nullptr) { FileStatistics stat; @@ -327,7 +343,8 @@ Status ModularFileSystem::GetFileSize(const std::string& fname, } Status ModularFileSystem::RenameFile(const std::string& src, - const std::string& target) { + const std::string& target, + TransactionToken* token) { if (ops_->rename_file == nullptr) { Status status = CopyFile(src, target); if (status.ok()) status = DeleteFile(src); @@ -343,7 +360,8 @@ Status ModularFileSystem::RenameFile(const std::string& src, } Status ModularFileSystem::CopyFile(const std::string& src, - const std::string& target) { + const std::string& target, + TransactionToken* token) { if (ops_->copy_file == nullptr) return FileSystem::CopyFile(src, target); UniquePtrTo_TF_Status plugin_status(TF_NewStatus(), TF_DeleteStatus); @@ -366,7 +384,7 @@ std::string ModularFileSystem::TranslateName(const std::string& name) const { return ret; } -void ModularFileSystem::FlushCaches() { +void ModularFileSystem::FlushCaches(TransactionToken* token) { if (ops_->flush_caches != nullptr) ops_->flush_caches(filesystem_.get()); } @@ -443,7 +461,7 @@ Status RegisterFilesystemPlugin(const std::string& dso_path) { // Step 1: Load plugin Env* env = Env::Default(); void* dso_handle; - TF_RETURN_IF_ERROR(env->LoadLibrary(dso_path.c_str(), &dso_handle)); + TF_RETURN_IF_ERROR(env->LoadDynamicLibrary(dso_path.c_str(), &dso_handle)); // Step 2: Load symbol for `TF_InitPlugin` void* dso_symbol; diff --git a/tensorflow/c/experimental/filesystem/modular_filesystem.h b/tensorflow/c/experimental/filesystem/modular_filesystem.h index baf665fd6aa..061a1aa446b 100644 --- a/tensorflow/c/experimental/filesystem/modular_filesystem.h +++ b/tensorflow/c/experimental/filesystem/modular_filesystem.h @@ -59,36 +59,48 @@ class ModularFileSystem final : public FileSystem { ~ModularFileSystem() override { ops_->cleanup(filesystem_.get()); } + TF_USE_FILESYSTEM_METHODS_WITH_NO_TRANSACTION_SUPPORT; + Status NewRandomAccessFile( - const std::string& fname, + const std::string& fname, TransactionToken* token, std::unique_ptr* result) override; - Status NewWritableFile(const std::string& fname, + Status NewWritableFile(const std::string& fname, TransactionToken* token, std::unique_ptr* result) override; - Status NewAppendableFile(const std::string& fname, + Status NewAppendableFile(const std::string& fname, TransactionToken* token, std::unique_ptr* result) override; Status NewReadOnlyMemoryRegionFromFile( - const std::string& fname, + const std::string& fname, TransactionToken* token, std::unique_ptr* result) override; - Status FileExists(const std::string& fname) override; + Status FileExists(const std::string& fname, TransactionToken* token) override; bool FilesExist(const std::vector& files, + TransactionToken* token, std::vector* status) override; - Status GetChildren(const std::string& dir, + Status GetChildren(const std::string& dir, TransactionToken* token, std::vector* result) override; - Status GetMatchingPaths(const std::string& pattern, + Status GetMatchingPaths(const std::string& pattern, TransactionToken* token, std::vector* results) override; - Status DeleteFile(const std::string& fname) override; - Status 
DeleteRecursively(const std::string& dirname, int64* undeleted_files, + Status DeleteFile(const std::string& fname, TransactionToken* token) override; + Status DeleteRecursively(const std::string& dirname, TransactionToken* token, + int64* undeleted_files, int64* undeleted_dirs) override; - Status DeleteDir(const std::string& dirname) override; - Status RecursivelyCreateDir(const std::string& dirname) override; - Status CreateDir(const std::string& dirname) override; - Status Stat(const std::string& fname, FileStatistics* stat) override; - Status IsDirectory(const std::string& fname) override; - Status GetFileSize(const std::string& fname, uint64* file_size) override; - Status RenameFile(const std::string& src, const std::string& target) override; - Status CopyFile(const std::string& src, const std::string& target) override; + Status DeleteDir(const std::string& dirname, + TransactionToken* token) override; + Status RecursivelyCreateDir(const std::string& dirname, + TransactionToken* token) override; + Status CreateDir(const std::string& dirname, + TransactionToken* token) override; + Status Stat(const std::string& fname, TransactionToken* token, + FileStatistics* stat) override; + Status IsDirectory(const std::string& fname, + TransactionToken* token) override; + Status GetFileSize(const std::string& fname, TransactionToken* token, + uint64* file_size) override; + Status RenameFile(const std::string& src, const std::string& target, + TransactionToken* token) override; + Status CopyFile(const std::string& src, const std::string& target, + TransactionToken* token) override; std::string TranslateName(const std::string& name) const override; - void FlushCaches() override; + void FlushCaches(TransactionToken* token) override; private: std::unique_ptr filesystem_; diff --git a/tensorflow/c/experimental/filesystem/modular_filesystem_test.cc b/tensorflow/c/experimental/filesystem/modular_filesystem_test.cc index 8ee47da01dd..7e0a95cc915 100644 --- a/tensorflow/c/experimental/filesystem/modular_filesystem_test.cc +++ b/tensorflow/c/experimental/filesystem/modular_filesystem_test.cc @@ -33,7 +33,6 @@ limitations under the License. // Windows defines the following macros to convert foo to fooA or fooW, // depending on the type of the string argument. We don't use these macros, so // undefine them here. 
-#undef LoadLibrary #undef CopyFile #undef DeleteFile #undef TranslateName diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD b/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD index a0c13701766..68875d61e47 100644 --- a/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD @@ -25,12 +25,15 @@ cc_library( "//tensorflow:windows": get_win_copts(), }), deps = [ + ":expiring_lru_cache", ":gcs_helper", + ":ram_file_block_cache", "//tensorflow/c:env", "//tensorflow/c:tf_status", "//tensorflow/c/experimental/filesystem:filesystem_interface", "@com_github_googlecloudplatform_google_cloud_cpp//:storage_client", "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:variant", ], ) @@ -44,14 +47,6 @@ cc_library( ], ) -cc_library( - name = "file_block_cache", - hdrs = ["file_block_cache.h"], - deps = [ - "//tensorflow/c:tf_status", - ], -) - cc_library( name = "cleanup", hdrs = ["cleanup.h"], @@ -63,7 +58,6 @@ cc_library( hdrs = ["ram_file_block_cache.h"], deps = [ ":cleanup", - ":file_block_cache", "//tensorflow/c:env", "//tensorflow/c:tf_status", "@com_google_absl//absl/base:core_headers", diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/file_block_cache.h b/tensorflow/c/experimental/filesystem/plugins/gcs/file_block_cache.h deleted file mode 100644 index 3ba7d8d7993..00000000000 --- a/tensorflow/c/experimental/filesystem/plugins/gcs/file_block_cache.h +++ /dev/null @@ -1,140 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_GCS_FILE_BLOCK_CACHE_H_ -#define TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_GCS_FILE_BLOCK_CACHE_H_ - -#include -#include -#include -#include -#include -#include -#include - -#include "tensorflow/c/tf_status.h" - -namespace tf_gcs_filesystem { - -class FileBlockCache; - -/// FileBlockCacheStatsInterface allows for instrumentation of the block cache. -/// -/// FileBlockCacheStatsInterface and its subclasses must be safe to use from -/// multiple threads concurrently. -/// -/// WARNING! This is an experimental interface that may change or go away at any -/// time. -class FileBlockCacheStatsInterface { - public: - /// Configure is called to provide instrumentation hooks. - /// - /// Note: Configure can be called multiple times (e.g. if the block cache is - /// re-initialized). - virtual void Configure(const FileBlockCache* block_cache) = 0; - - /// RecordBlockLoadRequest is called to record the size of a hit block. - virtual void RecordCacheHitBlockSize(size_t bytes_transferred) = 0; - - /// RecordBlockLoadRequest is called to record the size of a missed block. - virtual void RecordCacheMissBlockSize(size_t bytes_transferred) = 0; - - virtual ~FileBlockCacheStatsInterface() = default; -}; - -/// \brief A block cache of file contents, keyed by {filename, offset}. 
-/// -/// This class should be shared by read-only random access files on a remote -/// filesystem (e.g. GCS). -class FileBlockCache { - public: - /// The callback executed when a block is not found in the cache, and needs to - /// be fetched from the backing filesystem. This callback is provided when the - /// cache is constructed. The `status` should be `TF_OK` as long as the - /// read from the remote filesystem succeeded (similar to the semantics of the - /// read(2) system call). - typedef std::function - BlockFetcher; - - virtual ~FileBlockCache() {} - - /// Read `n` bytes from `filename` starting at `offset` into `buffer`. This - /// method will set `status` to: - /// - /// 1) The error from the remote filesystem, if the read from the remote - /// filesystem failed. - /// 2) `TF_FAILED_PRECONDITION` if the read from the remote filesystem - /// succeeded, - /// but the read returned a partial block, and the LRU cache contained a - /// block at a higher offset (indicating that the partial block should have - /// been a full block). - /// 3) `TF_OUT_OF_RANGE` if the read from the remote filesystem succeeded, but - /// the file contents do not extend past `offset` and thus nothing was - /// placed in `out`. - /// 4) `TF_OK` otherwise (i.e. the read succeeded, and at least one byte was - /// placed - /// in `buffer`). - /// - /// Caller is responsible for allocating memory for `buffer`. - /// `buffer` will be left unchanged in case of errors. - virtual void Read(const std::string& filename, size_t offset, size_t n, - char* buffer, size_t* bytes_transferred, - TF_Status* status) = 0; - - // Validate the given file signature with the existing file signature in the - // cache. Returns true if the signature doesn't change or the file did not - // exist before. If the signature changes, update the existing signature with - // the new one and remove the file from cache. - virtual bool ValidateAndUpdateFileSignature(const std::string& filename, - int64_t file_signature) = 0; - - /// Remove all cached blocks for `filename`. - virtual void RemoveFile(const std::string& filename) = 0; - - /// Remove all cached data. - virtual void Flush() = 0; - - /// Accessors for cache parameters. - virtual size_t block_size() const = 0; - virtual size_t max_bytes() const = 0; - virtual uint64_t max_staleness() const = 0; - - /// The current size (in bytes) of the cache. - virtual size_t CacheSize() const = 0; - - // Returns true if the cache is enabled. If false, the BlockFetcher callback - // is always executed during Read. - virtual bool IsCacheEnabled() const = 0; - - void SetStats(FileBlockCacheStatsInterface* stats) { - if (stats == nullptr) { - std::cerr - << "Attempted to monitor a NULL stats object. This may prevent the " - "corresponding monitoring data from being exported"; - return; - } - cache_stats_ = stats; - cache_stats_->Configure(this); - } - - protected: - FileBlockCacheStatsInterface* cache_stats_ = nullptr; // Not owned. -}; - -} // namespace tf_gcs_filesystem - -#endif // TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_GCS_FILE_BLOCK_CACHE_H_ diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc index 7861a5708b5..e01af918100 100644 --- a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc @@ -18,6 +18,8 @@ limitations under the License. 
#include #include "absl/strings/numbers.h" +#include "absl/strings/str_cat.h" +#include "absl/types/variant.h" #include "google/cloud/storage/client.h" #include "tensorflow/c/env.h" #include "tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.h" @@ -27,6 +29,27 @@ limitations under the License. // This filesystem will support `gs://` URI schemes. namespace gcs = google::cloud::storage; +// The environment variable that overrides the block size for aligned reads from +// GCS. Specified in MB (e.g. "16" = 16 x 1024 x 1024 = 16777216 bytes). +constexpr char kBlockSize[] = "GCS_READ_CACHE_BLOCK_SIZE_MB"; +constexpr size_t kDefaultBlockSize = 64 * 1024 * 1024; +// The environment variable that overrides the max size of the LRU cache of +// blocks read from GCS. Specified in MB. +constexpr char kMaxCacheSize[] = "GCS_READ_CACHE_MAX_SIZE_MB"; +constexpr size_t kDefaultMaxCacheSize = 0; +// The environment variable that overrides the maximum staleness of cached file +// contents. Once any block of a file reaches this staleness, all cached blocks +// will be evicted on the next read. +constexpr char kMaxStaleness[] = "GCS_READ_CACHE_MAX_STALENESS"; +constexpr uint64_t kDefaultMaxStaleness = 0; + +constexpr char kStatCacheMaxAge[] = "GCS_STAT_CACHE_MAX_AGE"; +constexpr uint64_t kStatCacheDefaultMaxAge = 5; +// The environment variable that overrides the maximum number of entries in the +// Stat cache. +constexpr char kStatCacheMaxEntries[] = "GCS_STAT_CACHE_MAX_ENTRIES"; +constexpr size_t kStatCacheDefaultMaxEntries = 1024; + // How to upload new data when Flush() is called multiple times. // By default the entire file is reuploaded. constexpr char kAppendMode[] = "GCS_APPEND_MODE"; @@ -81,28 +104,16 @@ static void MaybeAppendSlash(std::string* name) { name->push_back('/'); } -// SECTION 1. Implementation for `TF_RandomAccessFile` -// ---------------------------------------------------------------------------- -namespace tf_random_access_file { -typedef struct GCSFile { - const std::string bucket; - const std::string object; - gcs::Client* gcs_client; // not owned -} GCSFile; - -void Cleanup(TF_RandomAccessFile* file) { - auto gcs_file = static_cast(file->plugin_file); - delete gcs_file; -} - -// TODO(vnvo2409): Adding cache. -// `google-cloud-cpp` is working on a feature that we may want to use. -// See https://github.com/googleapis/google-cloud-cpp/issues/4013. -int64_t Read(const TF_RandomAccessFile* file, uint64_t offset, size_t n, - char* buffer, TF_Status* status) { - auto gcs_file = static_cast(file->plugin_file); - auto stream = gcs_file->gcs_client->ReadObject( - gcs_file->bucket, gcs_file->object, gcs::ReadRange(offset, offset + n)); +// A helper function to actually read the data from GCS. 
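// ---------------------------------------------------------------------------
// Illustrative example (editorial, not part of this change): the cache knobs
// above are read from the environment when the filesystem is initialized, so
// the embedding process can set them beforehand, for example with POSIX
// setenv. Values are illustrative; sizes are in MB and staleness in seconds,
// and with the default max cache size of 0 the block cache is effectively
// left disabled.
setenv("GCS_READ_CACHE_BLOCK_SIZE_MB", "16", /*overwrite=*/1);   // 16 MB blocks
setenv("GCS_READ_CACHE_MAX_SIZE_MB", "256", /*overwrite=*/1);    // 256 MB LRU cache
setenv("GCS_READ_CACHE_MAX_STALENESS", "300", /*overwrite=*/1);  // evict after 5 minutes
// ---------------------------------------------------------------------------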
+static int64_t LoadBufferFromGCS(const std::string& path, size_t offset, + size_t buffer_size, char* buffer, + tf_gcs_filesystem::GCSFile* gcs_file, + TF_Status* status) { + std::string bucket, object; + ParseGCSPath(path, false, &bucket, &object, status); + if (TF_GetCode(status) != TF_OK) return -1; + auto stream = gcs_file->gcs_client.ReadObject( + bucket, object, gcs::ReadRange(offset, offset + buffer_size)); TF_SetStatusFromGCSStatus(stream.status(), status); if ((TF_GetCode(status) != TF_OK) && (TF_GetCode(status) != TF_OUT_OF_RANGE)) { @@ -111,16 +122,119 @@ int64_t Read(const TF_RandomAccessFile* file, uint64_t offset, size_t n, int64_t read; if (!absl::SimpleAtoi(stream.headers().find("content-length")->second, &read)) { - TF_SetStatus(status, TF_UNKNOWN, "Could not get content-length header"); - return -1; - } - if (read != n) { - TF_SetStatus(status, TF_OUT_OF_RANGE, "Read less bytes than requested"); + // When we read a file with offset that is bigger than the actual file size. + // GCS will return an empty header (e.g no `content-length` header). In this + // case, we will set read to `0` and continue. + if (TF_GetCode(status) == TF_OUT_OF_RANGE) { + read = 0; + } else { + TF_SetStatus(status, TF_UNKNOWN, "Could not get content-length header"); + return -1; + } } + // `TF_OUT_OF_RANGE` isn't considered as an error. So we clear it here. + TF_SetStatus(status, TF_OK, ""); stream.read(buffer, read); + read = stream.gcount(); + if (read < buffer_size) { + // Check stat cache to see if we encountered an interrupted read. + tf_gcs_filesystem::GcsFileStat stat; + if (gcs_file->stat_cache->Lookup(path, &stat)) { + if (offset + read < stat.base.length) { + TF_SetStatus(status, TF_INTERNAL, + absl::StrCat("File contents are inconsistent for file: ", + path, " @ ", offset) + .c_str()); + } + } + } return read; } +// SECTION 1. Implementation for `TF_RandomAccessFile` +// ---------------------------------------------------------------------------- +namespace tf_random_access_file { +using ReadFn = + std::function; +typedef struct GCSFile { + const std::string path; + const bool is_cache_enable; + const uint64_t buffer_size; + ReadFn read_fn; + absl::Mutex buffer_mutex; + uint64_t buffer_start ABSL_GUARDED_BY(buffer_mutex); + bool buffer_end_is_past_eof ABSL_GUARDED_BY(buffer_mutex); + std::string buffer ABSL_GUARDED_BY(buffer_mutex); + + GCSFile(std::string path, bool is_cache_enable, uint64_t buffer_size, + ReadFn read_fn) + : path(path), + is_cache_enable(is_cache_enable), + buffer_size(buffer_size), + read_fn(std::move(read_fn)), + buffer_mutex(), + buffer_start(0), + buffer_end_is_past_eof(false), + buffer() {} +} GCSFile; + +void Cleanup(TF_RandomAccessFile* file) { + auto gcs_file = static_cast(file->plugin_file); + delete gcs_file; +} + +// `google-cloud-cpp` is working on a feature that we may want to use. +// See https://github.com/googleapis/google-cloud-cpp/issues/4013. 
+int64_t Read(const TF_RandomAccessFile* file, uint64_t offset, size_t n, + char* buffer, TF_Status* status) { + auto gcs_file = static_cast(file->plugin_file); + if (gcs_file->is_cache_enable || n > gcs_file->buffer_size) { + return gcs_file->read_fn(gcs_file->path, offset, n, buffer, status); + } else { + absl::MutexLock l(&gcs_file->buffer_mutex); + size_t buffer_end = gcs_file->buffer_start + gcs_file->buffer.size(); + size_t copy_size = 0; + if (offset < buffer_end && gcs_file->buffer_start) { + copy_size = (std::min)(n, static_cast(buffer_end - offset)); + memcpy(buffer, + gcs_file->buffer.data() + (offset - gcs_file->buffer_start), + copy_size); + } + bool consumed_buffer_to_eof = + offset + copy_size >= buffer_end && gcs_file->buffer_end_is_past_eof; + if (copy_size < n && !consumed_buffer_to_eof) { + gcs_file->buffer_start = offset + copy_size; + gcs_file->buffer.resize(gcs_file->buffer_size); + auto read_fill_buffer = gcs_file->read_fn( + gcs_file->path, gcs_file->buffer_start, gcs_file->buffer_size, + &(gcs_file->buffer[0]), status); + gcs_file->buffer_end_is_past_eof = + (TF_GetCode(status) == TF_OUT_OF_RANGE); + if (read_fill_buffer >= 0) gcs_file->buffer.resize(read_fill_buffer); + if (TF_GetCode(status) != TF_OK && + TF_GetCode(status) != TF_OUT_OF_RANGE) { + // Empty the buffer to avoid caching bad reads. + gcs_file->buffer.resize(0); + return -1; + } + size_t remaining_copy = + (std::min)(n - copy_size, gcs_file->buffer.size()); + memcpy(buffer + copy_size, gcs_file->buffer.data(), remaining_copy); + copy_size += remaining_copy; + } + if (copy_size < n) { + // Forget the end-of-file flag to allow for clients that poll on the + // same file. + gcs_file->buffer_end_is_past_eof = false; + TF_SetStatus(status, TF_OUT_OF_RANGE, "Read less bytes than requested"); + return copy_size; + } + TF_SetStatus(status, TF_OK, ""); + return copy_size; + } +} + } // namespace tf_random_access_file // SECTION 2. Implementation for `TF_WritableFile` @@ -289,11 +403,87 @@ uint64_t Length(const TF_ReadOnlyMemoryRegion* region) { // SECTION 4. Implementation for `TF_Filesystem`, the actual filesystem // ---------------------------------------------------------------------------- namespace tf_gcs_filesystem { -// TODO(vnvo2409): Add lazy-loading and customizing parameters. // TODO(vnvo2409): Use partial reponse for better performance. // TODO(vnvo2409): We could do some cleanups like `return TF_SetStatus`. // TODO(vnvo2409): Refactor the filesystem implementation when // https://github.com/googleapis/google-cloud-cpp/issues/4482 is done. +GCSFile::GCSFile(google::cloud::storage::Client&& gcs_client) + : gcs_client(gcs_client), block_cache_lock() { + const char* append_mode = std::getenv(kAppendMode); + compose = (append_mode != nullptr) && (!strcmp(kAppendMode, append_mode)); + + uint64_t value; + block_size = kDefaultBlockSize; + size_t max_bytes = kDefaultMaxCacheSize; + uint64_t max_staleness = kDefaultMaxStaleness; + + // Apply the overrides for the block size (MB), max bytes (MB), and max + // staleness (seconds) if provided. 
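// ---------------------------------------------------------------------------
// Editorial note (not part of this change): std::getenv returns nullptr when
// a variable is unset, and the absl::SimpleAtoi calls below take that result
// directly. A guarded helper along these lines (hypothetical) keeps the
// defaults above whenever the variable is absent or unparsable:
//   static bool GetEnvUint64(const char* var, uint64_t* value) {
//     const char* env_value = std::getenv(var);
//     return env_value != nullptr && absl::SimpleAtoi(env_value, value);
//   }
// ---------------------------------------------------------------------------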
+ if (absl::SimpleAtoi(std::getenv(kBlockSize), &value)) { + block_size = value * 1024 * 1024; + } + if (absl::SimpleAtoi(std::getenv(kMaxCacheSize), &value)) { + max_bytes = static_cast(value * 1024 * 1024); + } + if (absl::SimpleAtoi(std::getenv(kMaxStaleness), &value)) { + max_staleness = value; + } + + file_block_cache = std::make_unique( + block_size, max_bytes, max_staleness, + [this](const std::string& filename, size_t offset, size_t buffer_size, + char* buffer, TF_Status* status) { + return LoadBufferFromGCS(filename, offset, buffer_size, buffer, this, + status); + }); + + uint64_t stat_cache_max_age = kStatCacheDefaultMaxAge; + size_t stat_cache_max_entries = kStatCacheDefaultMaxEntries; + if (absl::SimpleAtoi(std::getenv(kStatCacheMaxAge), &value)) { + stat_cache_max_age = value; + } + if (absl::SimpleAtoi(std::getenv(kStatCacheMaxEntries), &value)) { + stat_cache_max_entries = static_cast(value); + } + stat_cache = std::make_unique>( + stat_cache_max_age, stat_cache_max_entries); +} + +GCSFile::GCSFile(google::cloud::storage::Client&& gcs_client, bool compose, + uint64_t block_size, size_t max_bytes, uint64_t max_staleness, + uint64_t stat_cache_max_age, size_t stat_cache_max_entries) + : gcs_client(gcs_client), + compose(compose), + block_cache_lock(), + block_size(block_size) { + file_block_cache = std::make_unique( + block_size, max_bytes, max_staleness, + [this](const std::string& filename, size_t offset, size_t buffer_size, + char* buffer, TF_Status* status) { + return LoadBufferFromGCS(filename, offset, buffer_size, buffer, this, + status); + }); + stat_cache = std::make_unique>( + stat_cache_max_age, stat_cache_max_entries); +} + +void InitTest(TF_Filesystem* filesystem, bool compose, uint64_t block_size, + size_t max_bytes, uint64_t max_staleness, + uint64_t stat_cache_max_age, size_t stat_cache_max_entries, + TF_Status* status) { + google::cloud::StatusOr client = + gcs::Client::CreateDefaultClient(); + if (!client) { + TF_SetStatusFromGCSStatus(client.status(), status); + return; + } + + filesystem->plugin_filesystem = + new GCSFile(std::move(client.value()), compose, block_size, max_bytes, + max_staleness, stat_cache_max_age, stat_cache_max_entries); + TF_SetStatus(status, TF_OK, ""); +} + void Init(TF_Filesystem* filesystem, TF_Status* status) { google::cloud::StatusOr client = gcs::Client::CreateDefaultClient(); @@ -302,12 +492,7 @@ void Init(TF_Filesystem* filesystem, TF_Status* status) { return; } - const char* append_mode = std::getenv(kAppendMode); - bool compose = - (append_mode != nullptr) && (!strcmp(kAppendMode, append_mode)); - - filesystem->plugin_filesystem = - new GCSFile({std::move(client.value()), compose}); + filesystem->plugin_filesystem = new GCSFile(std::move(client.value())); TF_SetStatus(status, TF_OK, ""); } @@ -316,6 +501,19 @@ void Cleanup(TF_Filesystem* filesystem) { delete gcs_file; } +static void UncachedStatForObject(const std::string& bucket, + const std::string& object, GcsFileStat* stat, + gcs::Client* gcs_client, TF_Status* status) { + auto metadata = gcs_client->GetObjectMetadata(bucket, object); + if (!metadata) return TF_SetStatusFromGCSStatus(metadata.status(), status); + stat->generation_number = metadata->generation(); + stat->base.length = metadata->size(); + stat->base.mtime_nsec = + metadata->time_storage_class_updated().time_since_epoch().count(); + stat->base.is_directory = object.back() == '/'; + return TF_SetStatus(status, TF_OK, ""); +} + // TODO(vnvo2409): Implement later void NewRandomAccessFile(const TF_Filesystem* 
filesystem, const char* path, TF_RandomAccessFile* file, TF_Status* status) { @@ -324,8 +522,46 @@ void NewRandomAccessFile(const TF_Filesystem* filesystem, const char* path, if (TF_GetCode(status) != TF_OK) return; auto gcs_file = static_cast(filesystem->plugin_filesystem); + bool is_cache_enabled; + { + absl::MutexLock l(&gcs_file->block_cache_lock); + is_cache_enabled = gcs_file->file_block_cache->IsCacheEnabled(); + } + auto read_fn = [gcs_file, is_cache_enabled, bucket, object]( + const std::string& path, uint64_t offset, size_t n, + char* buffer, TF_Status* status) -> int64_t { + int64_t read = 0; + if (is_cache_enabled) { + absl::ReaderMutexLock l(&gcs_file->block_cache_lock); + GcsFileStat stat; + gcs_file->stat_cache->LookupOrCompute( + path, &stat, + [gcs_file, bucket, object](const std::string& path, GcsFileStat* stat, + TF_Status* status) { + UncachedStatForObject(bucket, object, stat, &gcs_file->gcs_client, + status); + }, + status); + if (TF_GetCode(status) != TF_OK) return -1; + if (!gcs_file->file_block_cache->ValidateAndUpdateFileSignature( + path, stat.generation_number)) { + std::cout + << "File signature has been changed. Refreshing the cache. Path: " + << path; + } + read = gcs_file->file_block_cache->Read(path, offset, n, buffer, status); + } else { + read = LoadBufferFromGCS(path, offset, n, buffer, gcs_file, status); + } + if (TF_GetCode(status) != TF_OK) return -1; + if (read < n) + TF_SetStatus(status, TF_OUT_OF_RANGE, "Read less bytes than requested"); + else + TF_SetStatus(status, TF_OK, ""); + return read; + }; file->plugin_file = new tf_random_access_file::GCSFile( - {std::move(bucket), std::move(object), &gcs_file->gcs_client}); + std::move(path), is_cache_enabled, gcs_file->block_size, read_fn); TF_SetStatus(status, TF_OK, ""); } @@ -428,28 +664,179 @@ void NewReadOnlyMemoryRegionFromFile(const TF_Filesystem* filesystem, } } -void CreateDir(const TF_Filesystem* filesystem, const char* path, - TF_Status* status) { +static void StatForObject(GCSFile* gcs_file, const std::string& path, + const std::string& bucket, const std::string& object, + GcsFileStat* stat, TF_Status* status) { + if (object.empty()) + return TF_SetStatus( + status, TF_INVALID_ARGUMENT, + ("'object' must be a non-empty string. 
(File: " + path + ")").c_str()); + TF_SetStatus(status, TF_OK, ""); + gcs_file->stat_cache->LookupOrCompute( + path, stat, + [gcs_file, bucket, object](const std::string& path, GcsFileStat* stat, + TF_Status* status) { + UncachedStatForObject(bucket, object, stat, &gcs_file->gcs_client, + status); + }, + status); +} + +static bool ObjectExists(GCSFile* gcs_file, const std::string& path, + const std::string& bucket, const std::string& object, + TF_Status* status) { + GcsFileStat stat; + StatForObject(gcs_file, path, bucket, object, &stat, status); + if (TF_GetCode(status) != TF_OK && TF_GetCode(status) != TF_NOT_FOUND) + return false; + if (TF_GetCode(status) == TF_NOT_FOUND) { + TF_SetStatus(status, TF_OK, ""); + return false; + } + return !stat.base.is_directory; +} + +static bool BucketExists(GCSFile* gcs_file, const std::string& bucket, + TF_Status* status) { + auto metadata = gcs_file->gcs_client.GetBucketMetadata(bucket); + TF_SetStatusFromGCSStatus(metadata.status(), status); + if (TF_GetCode(status) != TF_OK && TF_GetCode(status) != TF_NOT_FOUND) + return false; + if (TF_GetCode(status) == TF_NOT_FOUND) { + TF_SetStatus(status, TF_OK, ""); + return false; + } + return true; +} + +static std::vector GetChildrenBounded( + GCSFile* gcs_file, std::string dir, uint64_t max_results, bool recursive, + bool include_self_directory_marker, TF_Status* status) { + std::string bucket, prefix; + MaybeAppendSlash(&dir); + ParseGCSPath(dir, true, &bucket, &prefix, status); + + std::vector result; + uint64_t count = 0; + std::string delimiter = recursive ? "" : "/"; + + for (auto&& item : gcs_file->gcs_client.ListObjectsAndPrefixes( + bucket, gcs::Prefix(prefix), gcs::Delimiter(delimiter))) { + if (count == max_results) { + TF_SetStatus(status, TF_OK, ""); + return result; + } + if (!item) { + TF_SetStatusFromGCSStatus(item.status(), status); + return result; + } + auto value = *std::move(item); + std::string children = absl::holds_alternative(value) + ? 
absl::get(value) + : absl::get(value).name(); + auto pos = children.find(prefix); + if (pos != 0) { + TF_SetStatus(status, TF_INTERNAL, + ("Unexpected response: the returned file name " + children + + " doesn't match the prefix " + prefix) + .c_str()); + return result; + } + children.erase(0, prefix.length()); + if (!children.empty() || include_self_directory_marker) { + result.emplace_back(children); + } + ++count; + } + + return result; +} + +static bool FolderExists(GCSFile* gcs_file, std::string dir, + TF_Status* status) { + ExpiringLRUCache::ComputeFunc compute_func = + [gcs_file](const std::string& dir, GcsFileStat* stat, TF_Status* status) { + auto children = + GetChildrenBounded(gcs_file, dir, 1, true, true, status); + if (TF_GetCode(status) != TF_OK) return; + if (!children.empty()) { + stat->base = {0, 0, true}; + return TF_SetStatus(status, TF_OK, ""); + } else { + return TF_SetStatus(status, TF_INVALID_ARGUMENT, "Not a directory!"); + } + }; + GcsFileStat stat; + MaybeAppendSlash(&dir); + gcs_file->stat_cache->LookupOrCompute(dir, &stat, compute_func, status); + if (TF_GetCode(status) != TF_OK && TF_GetCode(status) != TF_INVALID_ARGUMENT) + return false; + if (TF_GetCode(status) == TF_INVALID_ARGUMENT) { + TF_SetStatus(status, TF_OK, ""); + return false; + } + return true; +} + +static void ClearFileCaches(GCSFile* gcs_file, const std::string& path) { + absl::ReaderMutexLock l(&gcs_file->block_cache_lock); + gcs_file->file_block_cache->RemoveFile(path); + gcs_file->stat_cache->Delete(path); +} + +void PathExists(const TF_Filesystem* filesystem, const char* path, + TF_Status* status) { std::string bucket, object; ParseGCSPath(path, true, &bucket, &object, status); if (TF_GetCode(status) != TF_OK) return; + auto gcs_file = static_cast(filesystem->plugin_filesystem); if (object.empty()) { - auto bucket_metadata = gcs_file->gcs_client.GetBucketMetadata(bucket); - TF_SetStatusFromGCSStatus(bucket_metadata.status(), status); + bool result = BucketExists(gcs_file, bucket, status); + if (result) return TF_SetStatus(status, TF_OK, ""); + } + + GcsFileStat stat; + StatForObject(gcs_file, path, bucket, object, &stat, status); + if (TF_GetCode(status) != TF_NOT_FOUND) return; + + bool result = FolderExists(gcs_file, path, status); + if (TF_GetCode(status) != TF_OK || (TF_GetCode(status) == TF_OK && result)) + return; + return TF_SetStatus( + status, TF_NOT_FOUND, + absl::StrCat("The path ", path, " does not exist.").c_str()); +} + +void CreateDir(const TF_Filesystem* filesystem, const char* path, + TF_Status* status) { + std::string dir = path; + MaybeAppendSlash(&dir); + std::string bucket, object; + ParseGCSPath(dir, true, &bucket, &object, status); + if (TF_GetCode(status) != TF_OK) return; + auto gcs_file = static_cast(filesystem->plugin_filesystem); + if (object.empty()) { + bool is_directory = BucketExists(gcs_file, bucket, status); + if (TF_GetCode(status) != TF_OK) return; + if (!is_directory) + TF_SetStatus(status, TF_NOT_FOUND, + ("The specified bucket " + dir + " was not found.").c_str()); return; } - MaybeAppendSlash(&object); - auto object_metadata = gcs_file->gcs_client.GetObjectMetadata(bucket, object); - TF_SetStatusFromGCSStatus(object_metadata.status(), status); - if (TF_GetCode(status) == TF_NOT_FOUND) { - auto insert_metadata = - gcs_file->gcs_client.InsertObject(bucket, object, ""); - TF_SetStatusFromGCSStatus(insert_metadata.status(), status); - } else if (TF_GetCode(status) == TF_OK) { + PathExists(filesystem, dir.c_str(), status); + if (TF_GetCode(status) == TF_OK) 
+ return TF_SetStatus(status, TF_ALREADY_EXISTS, path); + + auto metadata = gcs_file->gcs_client.InsertObject( + bucket, object, "", + // Adding this parameter means HTTP_CODE_PRECONDITION_FAILED + // will be returned if the object already exists, so avoid reuploading. + gcs::IfGenerationMatch(0)); + TF_SetStatusFromGCSStatus(metadata.status(), status); + if (TF_GetCode(status) == TF_FAILED_PRECONDITION) TF_SetStatus(status, TF_ALREADY_EXISTS, path); - } } // TODO(vnvo2409): `RecursivelyCreateDir` should use `CreateDir` instead of the @@ -465,79 +852,31 @@ void DeleteFile(const TF_Filesystem* filesystem, const char* path, auto gcs_file = static_cast(filesystem->plugin_filesystem); auto gcs_status = gcs_file->gcs_client.DeleteObject(bucket, object); TF_SetStatusFromGCSStatus(gcs_status, status); + if (TF_GetCode(status) == TF_OK) ClearFileCaches(gcs_file, path); } +// Checks that the directory is empty (i.e no objects with this prefix exist). +// Deletes the GCS directory marker if it exists. void DeleteDir(const TF_Filesystem* filesystem, const char* path, TF_Status* status) { - std::string bucket, object; - ParseGCSPath(path, false, &bucket, &object, status); - if (TF_GetCode(status) != TF_OK) return; - MaybeAppendSlash(&object); + // A directory is considered empty either if there are no matching objects + // with the corresponding name prefix or if there is exactly one matching + // object and it is the directory marker. Therefore we need to retrieve + // at most two children for the prefix to detect if a directory is empty. auto gcs_file = static_cast(filesystem->plugin_filesystem); - int object_count = 0; - for (auto&& metadata : - gcs_file->gcs_client.ListObjects(bucket, gcs::Prefix(object))) { - if (!metadata) { - TF_SetStatusFromGCSStatus(metadata.status(), status); - return; - } - ++object_count; - // We consider a path is a non-empty directory in two cases: - // - There are more than two objects whose keys start with the name of this - // directory. - // - There is one object whose key contains the name of this directory ( but - // not equal ). - if (object_count > 1 || metadata->name() != object) { - TF_SetStatus(status, TF_FAILED_PRECONDITION, - "Cannot delete a non-empty directory."); - return; - } - } - auto gcs_status = gcs_file->gcs_client.DeleteObject(bucket, object); - TF_SetStatusFromGCSStatus(gcs_status, status); -} - -// TODO(vnvo2409): `DeleteRecursively` needs `GetChildrens` but there will be -// some differents compared to the default implementation. Will be refactored. -static void DeleteRecursively(const TF_Filesystem* filesystem, const char* path, - uint64_t* undeleted_files, - uint64_t* undeleted_dirs, TF_Status* status) { - std::string bucket, object; - ParseGCSPath(path, false, &bucket, &object, status); + auto childrens = GetChildrenBounded(gcs_file, path, 2, true, true, status); if (TF_GetCode(status) != TF_OK) return; - - auto gcs_file = static_cast(filesystem->plugin_filesystem); - auto gcs_status = gcs::DeleteByPrefix(gcs_file->gcs_client, bucket, object); - TF_SetStatusFromGCSStatus(gcs_status, status); - if (TF_GetCode(status) != TF_OK) return; - *undeleted_dirs = 0; - *undeleted_files = 0; -} - -// TODO(vnvo2409): `RewriteObjectBlocking` will set `status` to `TF_NOT_FOUND` -// if the object does not exist. In that case, we will have to check if the -// `src` is a directory or not to set the correspondent `status` (i.e -// `TF_NOT_FOUND` if path `src` does not exist, `TF_FAILED_PRECONDITION` if -// path `src` is a directory). 
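Since a GCS "rename" has to be composed from a rewrite followed by a delete of the source (the pattern used by the new `RenameObject`/`RenameFile` pair further down), a condensed sketch of that object-level flow with the `google-cloud-cpp` client may help. Names are illustrative and cache invalidation is omitted, so treat this as an outline rather than the plugin's implementation.

namespace gcs = google::cloud::storage;

// Sketch: GCS has no atomic rename, so "rename" is rewrite + delete-source.
static void RenameObjectSketch(gcs::Client& client,
                               const std::string& src_bucket,
                               const std::string& src_object,
                               const std::string& dst_bucket,
                               const std::string& dst_object,
                               TF_Status* status) {
  auto metadata = client.RewriteObjectBlocking(src_bucket, src_object,
                                               dst_bucket, dst_object);
  TF_SetStatusFromGCSStatus(metadata.status(), status);
  if (TF_GetCode(status) != TF_OK) return;  // e.g. TF_NOT_FOUND if src is absent.
  auto delete_status = client.DeleteObject(src_bucket, src_object);
  TF_SetStatusFromGCSStatus(delete_status, status);
}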
-void RenameFile(const TF_Filesystem* filesystem, const char* src, - const char* dst, TF_Status* status) { - std::string bucket_src, object_src; - ParseGCSPath(src, false, &bucket_src, &object_src, status); - if (TF_GetCode(status) != TF_OK) return; - - std::string bucket_dst, object_dst; - ParseGCSPath(dst, false, &bucket_dst, &object_dst, status); - if (TF_GetCode(status) != TF_OK) return; - - auto gcs_file = static_cast(filesystem->plugin_filesystem); - auto metadata = gcs_file->gcs_client.RewriteObjectBlocking( - bucket_src, object_src, bucket_dst, object_dst); - if (!metadata) { - TF_SetStatusFromGCSStatus(metadata.status(), status); + if (childrens.size() > 1 || (childrens.size() == 1 && !childrens[0].empty())) + return TF_SetStatus(status, TF_FAILED_PRECONDITION, + "Cannot delete a non-empty directory."); + if (childrens.size() == 1 && childrens[0].empty()) { + // This is the directory marker object. Delete it. + std::string dir = path; + MaybeAppendSlash(&dir); + DeleteFile(filesystem, dir.c_str(), status); return; } - auto gcs_status = gcs_file->gcs_client.DeleteObject(bucket_src, object_src); - TF_SetStatusFromGCSStatus(gcs_status, status); + TF_SetStatus(status, TF_OK, ""); } void CopyFile(const TF_Filesystem* filesystem, const char* src, const char* dst, @@ -556,6 +895,183 @@ void CopyFile(const TF_Filesystem* filesystem, const char* src, const char* dst, TF_SetStatusFromGCSStatus(metadata.status(), status); } +bool IsDirectory(const TF_Filesystem* filesystem, const char* path, + TF_Status* status) { + std::string bucket, object; + ParseGCSPath(path, true, &bucket, &object, status); + if (TF_GetCode(status) != TF_OK) return false; + + auto gcs_file = static_cast(filesystem->plugin_filesystem); + if (object.empty()) { + bool result = BucketExists(gcs_file, bucket, status); + if (TF_GetCode(status) != TF_OK) return false; + if (!result) + TF_SetStatus( + status, TF_NOT_FOUND, + ("The specified bucket gs://" + bucket + " was not found.").c_str()); + return result; + } + + bool is_folder = FolderExists(gcs_file, path, status); + if (TF_GetCode(status) != TF_OK) return false; + if (is_folder) return true; + + bool is_object = ObjectExists(gcs_file, path, bucket, object, status); + if (TF_GetCode(status) != TF_OK) return false; + if (is_object) { + TF_SetStatus( + status, TF_FAILED_PRECONDITION, + absl::StrCat("The specified path ", path, " is not a directory.") + .c_str()); + return false; + } + TF_SetStatus(status, TF_NOT_FOUND, + absl::StrCat("The path ", path, " does not exist.").c_str()); + return false; +} + +static void RenameObject(const TF_Filesystem* filesystem, + const std::string& src, const std::string& dst, + TF_Status* status) { + std::string bucket_src, object_src; + ParseGCSPath(src, false, &bucket_src, &object_src, status); + if (TF_GetCode(status) != TF_OK) return; + + std::string bucket_dst, object_dst; + ParseGCSPath(dst, false, &bucket_dst, &object_dst, status); + if (TF_GetCode(status) != TF_OK) return; + + auto gcs_file = static_cast(filesystem->plugin_filesystem); + auto metadata = gcs_file->gcs_client.RewriteObjectBlocking( + bucket_src, object_src, bucket_dst, object_dst); + TF_SetStatusFromGCSStatus(metadata.status(), status); + if (TF_GetCode(status) != TF_OK) return; + + ClearFileCaches(gcs_file, dst); + DeleteFile(filesystem, src.c_str(), status); +} + +void RenameFile(const TF_Filesystem* filesystem, const char* src, + const char* dst, TF_Status* status) { + if (!IsDirectory(filesystem, src, status)) { + if (TF_GetCode(status) == 
TF_FAILED_PRECONDITION) + RenameObject(filesystem, src, dst, status); + return; + } + + auto gcs_file = static_cast(filesystem->plugin_filesystem); + std::vector childrens = + GetChildrenBounded(gcs_file, src, UINT64_MAX, true, true, status); + if (TF_GetCode(status) != TF_OK) return; + + std::string src_dir = src; + std::string dst_dir = dst; + MaybeAppendSlash(&src_dir); + MaybeAppendSlash(&dst_dir); + for (const std::string& children : childrens) { + RenameObject(filesystem, src_dir + children, dst_dir + children, status); + if (TF_GetCode(status) != TF_OK) return; + } + TF_SetStatus(status, TF_OK, ""); +} + +void DeleteRecursively(const TF_Filesystem* filesystem, const char* path, + uint64_t* undeleted_files, uint64_t* undeleted_dirs, + TF_Status* status) { + if (!undeleted_files || !undeleted_dirs) + return TF_SetStatus( + status, TF_INTERNAL, + "'undeleted_files' and 'undeleted_dirs' cannot be nullptr."); + *undeleted_files = 0; + *undeleted_dirs = 0; + if (!IsDirectory(filesystem, path, status)) { + *undeleted_dirs = 1; + return; + } + auto gcs_file = static_cast(filesystem->plugin_filesystem); + std::vector childrens = + GetChildrenBounded(gcs_file, path, UINT64_MAX, true, true, status); + if (TF_GetCode(status) != TF_OK) return; + + std::string dir = path; + MaybeAppendSlash(&dir); + for (const std::string& children : childrens) { + const std::string& full_path = dir + children; + DeleteFile(filesystem, full_path.c_str(), status); + if (TF_GetCode(status) != TF_OK) { + if (IsDirectory(filesystem, full_path.c_str(), status)) + // The object is a directory marker. + (*undeleted_dirs)++; + else + (*undeleted_files)++; + } + } +} + +int GetChildren(const TF_Filesystem* filesystem, const char* path, + char*** entries, TF_Status* status) { + auto gcs_file = static_cast(filesystem->plugin_filesystem); + std::vector childrens = + GetChildrenBounded(gcs_file, path, UINT64_MAX, false, false, status); + if (TF_GetCode(status) != TF_OK) return -1; + + int num_entries = childrens.size(); + *entries = static_cast( + plugin_memory_allocate(num_entries * sizeof((*entries)[0]))); + for (int i = 0; i < num_entries; i++) + (*entries)[i] = strdup(childrens[i].c_str()); + TF_SetStatus(status, TF_OK, ""); + return num_entries; +} + +void Stat(const TF_Filesystem* filesystem, const char* path, + TF_FileStatistics* stats, TF_Status* status) { + std::string bucket, object; + ParseGCSPath(path, true, &bucket, &object, status); + if (TF_GetCode(status) != TF_OK) return; + + auto gcs_file = static_cast(filesystem->plugin_filesystem); + if (object.empty()) { + auto bucket_metadata = gcs_file->gcs_client.GetBucketMetadata(bucket); + TF_SetStatusFromGCSStatus(bucket_metadata.status(), status); + if (TF_GetCode(status) == TF_OK) { + stats->is_directory = true; + stats->length = 0; + stats->mtime_nsec = 0; + } + return; + } + if (IsDirectory(filesystem, path, status)) { + stats->is_directory = true; + stats->length = 0; + stats->mtime_nsec = 0; + return TF_SetStatus(status, TF_OK, ""); + } + if (TF_GetCode(status) == TF_OK) { + auto metadata = gcs_file->gcs_client.GetObjectMetadata(bucket, object); + if (metadata) { + stats->is_directory = false; + stats->length = metadata.value().size(); + stats->mtime_nsec = metadata.value() + .time_storage_class_updated() + .time_since_epoch() + .count(); + } + TF_SetStatusFromGCSStatus(metadata.status(), status); + } +} + +static char* TranslateName(const TF_Filesystem* filesystem, const char* uri) { + return strdup(uri); +} + +static void FlushCaches(const TF_Filesystem* 
filesystem) { + auto gcs_file = static_cast(filesystem->plugin_filesystem); + absl::ReaderMutexLock l(&gcs_file->block_cache_lock); + gcs_file->file_block_cache->Flush(); + gcs_file->stat_cache->Clear(); +} + } // namespace tf_gcs_filesystem static void ProvideFilesystemSupportFor(TF_FilesystemPluginOps* ops, @@ -572,6 +1088,13 @@ static void ProvideFilesystemSupportFor(TF_FilesystemPluginOps* ops, plugin_memory_allocate(TF_WRITABLE_FILE_OPS_SIZE)); ops->writable_file_ops->cleanup = tf_writable_file::Cleanup; + ops->read_only_memory_region_ops = static_cast( + plugin_memory_allocate(TF_READ_ONLY_MEMORY_REGION_OPS_SIZE)); + ops->read_only_memory_region_ops->cleanup = + tf_read_only_memory_region::Cleanup; + ops->read_only_memory_region_ops->data = tf_read_only_memory_region::Data; + ops->read_only_memory_region_ops->length = tf_read_only_memory_region::Length; + ops->filesystem_ops = static_cast( plugin_memory_allocate(TF_FILESYSTEM_OPS_SIZE)); ops->filesystem_ops->init = tf_gcs_filesystem::Init; @@ -581,6 +1104,20 @@ static void ProvideFilesystemSupportFor(TF_FilesystemPluginOps* ops, ops->filesystem_ops->new_writable_file = tf_gcs_filesystem::NewWritableFile; ops->filesystem_ops->new_appendable_file = tf_gcs_filesystem::NewAppendableFile; + ops->filesystem_ops->new_read_only_memory_region_from_file = + tf_gcs_filesystem::NewReadOnlyMemoryRegionFromFile; + ops->filesystem_ops->create_dir = tf_gcs_filesystem::CreateDir; + ops->filesystem_ops->delete_file = tf_gcs_filesystem::DeleteFile; + ops->filesystem_ops->delete_dir = tf_gcs_filesystem::DeleteDir; + ops->filesystem_ops->delete_recursively = + tf_gcs_filesystem::DeleteRecursively; + ops->filesystem_ops->copy_file = tf_gcs_filesystem::CopyFile; + ops->filesystem_ops->path_exists = tf_gcs_filesystem::PathExists; + ops->filesystem_ops->is_directory = tf_gcs_filesystem::IsDirectory; + ops->filesystem_ops->stat = tf_gcs_filesystem::Stat; + ops->filesystem_ops->get_children = tf_gcs_filesystem::GetChildren; + ops->filesystem_ops->translate_name = tf_gcs_filesystem::TranslateName; + ops->filesystem_ops->flush_caches = tf_gcs_filesystem::FlushCaches; } void TF_InitPlugin(TF_FilesystemPluginInfo* info) { diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.h b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.h index 93862f4a871..973ce9e9dc2 100644 --- a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.h +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.h @@ -17,6 +17,8 @@ #include "google/cloud/storage/client.h" #include "tensorflow/c/experimental/filesystem/filesystem_interface.h" +#include "tensorflow/c/experimental/filesystem/plugins/gcs/expiring_lru_cache.h" +#include "tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache.h" #include "tensorflow/c/tf_status.h" void ParseGCSPath(const std::string& fname, bool object_empty_ok, @@ -45,10 +47,34 @@ uint64_t Length(const TF_ReadOnlyMemoryRegion* region); } // namespace tf_read_only_memory_region namespace tf_gcs_filesystem { +typedef struct GcsFileStat { + TF_FileStatistics base; + int64_t generation_number; +} GcsFileStat; + typedef struct GCSFile { google::cloud::storage::Client gcs_client; // owned bool compose; + absl::Mutex block_cache_lock; + std::shared_ptr file_block_cache + ABSL_GUARDED_BY(block_cache_lock); + uint64_t block_size; // Reads smaller than block_size will trigger a read + // of block_size. 
+ std::unique_ptr> stat_cache; + GCSFile(google::cloud::storage::Client&& gcs_client); + // This constructor is used for testing purposes only. + GCSFile(google::cloud::storage::Client&& gcs_client, bool compose, + uint64_t block_size, size_t max_bytes, uint64_t max_staleness, + uint64_t stat_cache_max_age, size_t stat_cache_max_entries); } GCSFile; + +// This function initializes a filesystem for testing without having to set +// environment variables manually. +void InitTest(TF_Filesystem* filesystem, bool compose, uint64_t block_size, + size_t max_bytes, uint64_t max_staleness, + uint64_t stat_cache_max_age, size_t stat_cache_max_entries, + TF_Status* status); + void Init(TF_Filesystem* filesystem, TF_Status* status); void Cleanup(TF_Filesystem* filesystem); void NewRandomAccessFile(const TF_Filesystem* filesystem, const char* path, diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem_test.cc b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem_test.cc index 0e3c855d6c6..82c4e4b8705 100644 --- a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem_test.cc +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem_test.cc @@ -66,6 +66,9 @@ static std::string* GetTmpDir() { namespace tensorflow { namespace { +// TODO(vnvo2409): Refactor `gcs_filesystem_test` to remove unnecessary tests +// after porting all tests from +// `//tensorflow/core/platform/cloud:gcs_file_system_test`. class GCSFilesystemTest : public ::testing::Test { public: void SetUp() override { @@ -74,13 +77,14 @@ class GCSFilesystemTest : public ::testing::Test { ::testing::UnitTest::GetInstance()->current_test_info()->name()); status_ = TF_NewStatus(); filesystem_ = new TF_Filesystem; - tf_gcs_filesystem::Init(filesystem_, status_); - ASSERT_TF_OK(status_) << "Could not initialize filesystem. " - << TF_Message(status_); + filesystem_->plugin_filesystem = nullptr; + // Different tests require different filesystem setups, so we initialize + // the filesystem in each test case instead. } void TearDown() override { TF_DeleteStatus(status_); - tf_gcs_filesystem::Cleanup(filesystem_); + if (filesystem_->plugin_filesystem != nullptr) + tf_gcs_filesystem::Cleanup(filesystem_); delete filesystem_; } @@ -117,6 +121,21 @@ class GCSFilesystemTest : public ::testing::Test { } } +::testing::AssertionResult InsertObject(const std::string& path, + const std::string& content, + gcs::Client* gcs_client, + TF_Status* status) { + std::string bucket, object; + ParseGCSPath(path, false, &bucket, &object, status); + if (TF_GetCode(status) != TF_OK) + return ::testing::AssertionFailure() << TF_Message(status); + auto metadata = gcs_client->InsertObject(bucket, object, content); + if (metadata) + return ::testing::AssertionSuccess(); + else + return ::testing::AssertionFailure() << metadata.status().message(); +} + ::testing::AssertionResult CompareSubString(int64_t offset, size_t length, absl::string_view result, size_t read) { @@ -172,6 +191,9 @@ TEST_F(GCSFilesystemTest, ParseGCSPath) { } TEST_F(GCSFilesystemTest, RandomAccessFile) { + tf_gcs_filesystem::Init(filesystem_, status_); + ASSERT_TF_OK(status_) << "Could not initialize filesystem. 
" + << TF_Message(status_); std::string filepath = GetURIForPath("a_file"); TF_RandomAccessFile* file = new TF_RandomAccessFile; tf_gcs_filesystem::NewRandomAccessFile(filesystem_, filepath.c_str(), file, @@ -208,6 +230,9 @@ TEST_F(GCSFilesystemTest, RandomAccessFile) { } TEST_F(GCSFilesystemTest, WritableFile) { + tf_gcs_filesystem::Init(filesystem_, status_); + ASSERT_TF_OK(status_) << "Could not initialize filesystem. " + << TF_Message(status_); std::string filepath = GetURIForPath("a_file"); TF_WritableFile* file = new TF_WritableFile; tf_gcs_filesystem::NewWritableFile(filesystem_, filepath.c_str(), file, @@ -273,6 +298,9 @@ TEST_F(GCSFilesystemTest, WritableFile) { } TEST_F(GCSFilesystemTest, ReadOnlyMemoryRegion) { + tf_gcs_filesystem::Init(filesystem_, status_); + ASSERT_TF_OK(status_) << "Could not initialize filesystem. " + << TF_Message(status_); std::string path = GetURIForPath("a_file"); auto gcs_file = static_cast(filesystem_->plugin_filesystem); @@ -298,6 +326,131 @@ TEST_F(GCSFilesystemTest, ReadOnlyMemoryRegion) { delete region; } +// These tests below are ported from +// `//tensorflow/core/platform/cloud:gcs_file_system_test` +TEST_F(GCSFilesystemTest, NewRandomAccessFile_NoBlockCache) { + tf_gcs_filesystem::InitTest(filesystem_, false, 0, 0, 0, 0, 0, status_); + ASSERT_TF_OK(status_) << "Could not initialize filesystem. " + << TF_Message(status_); + std::string path = GetURIForPath("a_file"); + auto gcs_file = + static_cast(filesystem_->plugin_filesystem); + ASSERT_TRUE(InsertObject(path, "0123456789", &gcs_file->gcs_client, status_)); + + TF_RandomAccessFile* file = new TF_RandomAccessFile; + tf_gcs_filesystem::NewRandomAccessFile(filesystem_, path.c_str(), file, + status_); + ASSERT_TF_OK(status_); + + std::string result; + result.resize(6); + int64_t read = tf_random_access_file::Read(file, 0, 6, &result[0], status_); + ASSERT_EQ(read, 6) << "Read: " << read << "\n"; + ASSERT_TF_OK(status_); + ASSERT_EQ(result, "012345") << "Result: " << result << "\n"; + + read = tf_random_access_file::Read(file, 6, 6, &result[0], status_); + ASSERT_EQ(read, 4) << "Read: " << read << "\n"; + ASSERT_EQ(TF_GetCode(status_), TF_OUT_OF_RANGE) << TF_Message(status_); + result.resize(read); + ASSERT_EQ(result, "6789") << "Result: " << result << "\n"; +} + +TEST_F(GCSFilesystemTest, NewRandomAccessFile_Buffered) { + tf_gcs_filesystem::InitTest(filesystem_, false, 10, 0, 0, 0, 0, status_); + ASSERT_TF_OK(status_) << "Could not initialize filesystem. 
" + << TF_Message(status_); + std::string path = GetURIForPath("a_file"); + auto gcs_file = + static_cast(filesystem_->plugin_filesystem); + ASSERT_TRUE(InsertObject(path, "0123456789", &gcs_file->gcs_client, status_)); + + TF_RandomAccessFile* file = new TF_RandomAccessFile; + tf_gcs_filesystem::NewRandomAccessFile(filesystem_, path.c_str(), file, + status_); + ASSERT_TF_OK(status_); + + std::string result; + result.resize(6); + int64_t read = tf_random_access_file::Read(file, 0, 6, &result[0], status_); + ASSERT_EQ(read, 6) << "Read: " << read << "\n"; + ASSERT_TF_OK(status_); + ASSERT_EQ(result, "012345") << "Result: " << result << "\n"; + + read = tf_random_access_file::Read(file, 6, 6, &result[0], status_); + ASSERT_EQ(read, 4) << "Read: " << read << "\n"; + ASSERT_EQ(TF_GetCode(status_), TF_OUT_OF_RANGE) << TF_Message(status_); + result.resize(read); + ASSERT_EQ(result, "6789") << "Result: " << result << "\n"; +} + +TEST_F(GCSFilesystemTest, NewRandomAccessFile_Buffered_ReadAtEOF) { + tf_gcs_filesystem::InitTest(filesystem_, false, 10, 0, 0, 0, 0, status_); + ASSERT_TF_OK(status_) << "Could not initialize filesystem. " + << TF_Message(status_); + std::string path = GetURIForPath("a_file"); + auto gcs_file = + static_cast(filesystem_->plugin_filesystem); + ASSERT_TRUE(InsertObject(path, "0123456789", &gcs_file->gcs_client, status_)); + + TF_RandomAccessFile* file = new TF_RandomAccessFile; + tf_gcs_filesystem::NewRandomAccessFile(filesystem_, path.c_str(), file, + status_); + ASSERT_TF_OK(status_); + + std::string result; + result.resize(10); + int64_t read = tf_random_access_file::Read(file, 0, result.length(), + &result[0], status_); + ASSERT_EQ(read, 10) << "Read: " << read << "\n"; + ASSERT_TF_OK(status_); + ASSERT_EQ(result, "0123456789") << "Result: " << result << "\n"; + + read = tf_random_access_file::Read(file, result.length(), result.length(), + &result[0], status_); + ASSERT_EQ(read, 0) << "Read: " << read << "\n"; + ASSERT_EQ(TF_GetCode(status_), TF_OUT_OF_RANGE) << TF_Message(status_); + result.resize(read); + ASSERT_EQ(result, "") << "Result: " << result << "\n"; +} + +TEST_F(GCSFilesystemTest, NewRandomAccessFile_Buffered_CachedOutOfRange) { + tf_gcs_filesystem::InitTest(filesystem_, false, 10, 0, 0, 0, 0, status_); + ASSERT_TF_OK(status_) << "Could not initialize filesystem. 
" + << TF_Message(status_); + std::string path = GetURIForPath("a_file"); + auto gcs_file = + static_cast(filesystem_->plugin_filesystem); + ASSERT_TRUE(InsertObject(path, "012345678", &gcs_file->gcs_client, status_)); + + TF_RandomAccessFile* file = new TF_RandomAccessFile; + tf_gcs_filesystem::NewRandomAccessFile(filesystem_, path.c_str(), file, + status_); + ASSERT_TF_OK(status_); + + std::string result; + result.resize(5); + int64_t read = tf_random_access_file::Read(file, 0, result.length(), + &result[0], status_); + ASSERT_EQ(read, 5) << "Read: " << read << "\n"; + ASSERT_TF_OK(status_); + ASSERT_EQ(result, "01234") << "Result: " << result << "\n"; + + read = tf_random_access_file::Read(file, 4, result.length(), &result[0], + status_); + ASSERT_EQ(read, 5) << "Read: " << read << "\n"; + ASSERT_TF_OK(status_); + result.resize(read); + ASSERT_EQ(result, "45678") << "Result: " << result << "\n"; + + read = tf_random_access_file::Read(file, 5, result.length(), &result[0], + status_); + ASSERT_EQ(read, 4) << "Read: " << read << "\n"; + ASSERT_EQ(TF_GetCode(status_), TF_OUT_OF_RANGE) << TF_Message(status_); + result.resize(read); + ASSERT_EQ(result, "5678") << "Result: " << result << "\n"; +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache.cc b/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache.cc index 102c7fa175c..3700ccf17a2 100644 --- a/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache.cc +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache.cc @@ -39,9 +39,6 @@ std::shared_ptr RamFileBlockCache::Lookup( auto entry = block_map_.find(key); if (entry != block_map_.end()) { if (BlockNotStale(entry->second)) { - if (cache_stats_ != nullptr) { - cache_stats_->RecordCacheHitBlockSize(entry->second->data.size()); - } return entry->second; } else { // Remove the stale block and continue. @@ -136,12 +133,9 @@ void RamFileBlockCache::MaybeFetch(const Key& key, block->mu.Unlock(); // Release the lock while making the API call. block->data.clear(); block->data.resize(block_size_, 0); - size_t bytes_transferred; - block_fetcher_(key.first, key.second, block_size_, block->data.data(), - &bytes_transferred, status); - if (cache_stats_ != nullptr) { - cache_stats_->RecordCacheMissBlockSize(bytes_transferred); - } + int64_t bytes_transferred; + bytes_transferred = block_fetcher_(key.first, key.second, block_size_, + block->data.data(), status); block->mu.Lock(); // Reacquire the lock immediately afterwards if (TF_GetCode(status) == TF_OK) { block->data.resize(bytes_transferred, 0); @@ -171,18 +165,16 @@ void RamFileBlockCache::MaybeFetch(const Key& key, "Control flow should never reach the end of RamFileBlockCache::Fetch."); } -void RamFileBlockCache::Read(const std::string& filename, size_t offset, - size_t n, char* buffer, size_t* bytes_transferred, - TF_Status* status) { - *bytes_transferred = 0; +int64_t RamFileBlockCache::Read(const std::string& filename, size_t offset, + size_t n, char* buffer, TF_Status* status) { if (n == 0) { - return TF_SetStatus(status, TF_OK, ""); + TF_SetStatus(status, TF_OK, ""); + return 0; } if (!IsCacheEnabled() || (n > max_bytes_)) { // The cache is effectively disabled, so we pass the read through to the // fetcher without breaking it up into blocks. 
- return block_fetcher_(filename, offset, n, buffer, bytes_transferred, - status); + return block_fetcher_(filename, offset, n, buffer, status); } // Calculate the block-aligned start and end of the read. size_t start = block_size_ * (offset / block_size_); @@ -202,20 +194,20 @@ void RamFileBlockCache::Read(const std::string& filename, size_t offset, abort(); } MaybeFetch(key, block, status); - if (TF_GetCode(status) != TF_OK) return; + if (TF_GetCode(status) != TF_OK) return -1; UpdateLRU(key, block, status); - if (TF_GetCode(status) != TF_OK) return; + if (TF_GetCode(status) != TF_OK) return -1; // Copy the relevant portion of the block into the result buffer. const auto& data = block->data; if (offset >= pos + data.size()) { // The requested offset is at or beyond the end of the file. This can // happen if `offset` is not block-aligned, and the read returns the last // block in the file, which does not extend all the way out to `offset`. - *bytes_transferred = total_bytes_transferred; std::stringstream os; os << "EOF at offset " << offset << " in file " << filename << " at position " << pos << " with data size " << data.size(); - return TF_SetStatus(status, TF_OUT_OF_RANGE, std::move(os).str().c_str()); + TF_SetStatus(status, TF_OUT_OF_RANGE, std::move(os).str().c_str()); + return total_bytes_transferred; } auto begin = data.begin(); if (offset > pos) { @@ -237,8 +229,8 @@ void RamFileBlockCache::Read(const std::string& filename, size_t offset, break; } } - *bytes_transferred = total_bytes_transferred; - return TF_SetStatus(status, TF_OK, ""); + TF_SetStatus(status, TF_OK, ""); + return total_bytes_transferred; } bool RamFileBlockCache::ValidateAndUpdateFileSignature( diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache.h b/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache.h index 5a82f65db41..2abfb6f924b 100644 --- a/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache.h +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache.h @@ -28,7 +28,6 @@ limitations under the License. #include "absl/synchronization/mutex.h" #include "absl/synchronization/notification.h" #include "tensorflow/c/env.h" -#include "tensorflow/c/experimental/filesystem/plugins/gcs/file_block_cache.h" #include "tensorflow/c/tf_status.h" namespace tf_gcs_filesystem { @@ -37,16 +36,17 @@ namespace tf_gcs_filesystem { /// /// This class should be shared by read-only random access files on a remote /// filesystem (e.g. GCS). -class RamFileBlockCache : public FileBlockCache { +class RamFileBlockCache { public: /// The callback executed when a block is not found in the cache, and needs to /// be fetched from the backing filesystem. This callback is provided when the - /// cache is constructed. The `status` should be `TF_OK` as long as the - /// read from the remote filesystem succeeded (similar to the semantics of the - /// read(2) system call). - typedef std::function + /// cache is constructed. It returns total bytes read ( -1 in case of errors + /// ). The `status` should be `TF_OK` as long as the read from the remote + /// filesystem succeeded (similar to the semantics of the read(2) system + /// call). + typedef std::function BlockFetcher; RamFileBlockCache(size_t block_size, size_t max_bytes, uint64_t max_staleness, @@ -66,10 +66,10 @@ class RamFileBlockCache : public FileBlockCache { TF_StartThread(&thread_options, "TF_prune_FBC", PruneThread, this)); } std::cout << "GCS file block cache is " - << (IsCacheEnabled() ? 
"enabled" : "disabled"); + << (IsCacheEnabled() ? "enabled" : "disabled") << ".\n"; } - ~RamFileBlockCache() override { + ~RamFileBlockCache() { if (pruning_thread_) { stop_pruning_thread_.Notify(); // Destroying pruning_thread_ will block until Prune() receives the above @@ -78,8 +78,9 @@ class RamFileBlockCache : public FileBlockCache { } } - /// Read `n` bytes from `filename` starting at `offset` into `buffer`. This - /// method will set `status` to: + /// Read `n` bytes from `filename` starting at `offset` into `buffer`. It + /// returns total bytes read ( -1 in case of errors ). This method will set + /// `status` to: /// /// 1) The error from the remote filesystem, if the read from the remote /// filesystem failed. @@ -97,37 +98,34 @@ class RamFileBlockCache : public FileBlockCache { /// /// Caller is responsible for allocating memory for `buffer`. /// `buffer` will be left unchanged in case of errors. - void Read(const std::string& filename, size_t offset, size_t n, char* buffer, - size_t* bytes_transferred, TF_Status* status) override; + int64_t Read(const std::string& filename, size_t offset, size_t n, + char* buffer, TF_Status* status); // Validate the given file signature with the existing file signature in the // cache. Returns true if the signature doesn't change or the file doesn't // exist before. If the signature changes, update the existing signature with // the new one and remove the file from cache. bool ValidateAndUpdateFileSignature(const std::string& filename, - int64_t file_signature) override + int64_t file_signature) ABSL_LOCKS_EXCLUDED(mu_); /// Remove all cached blocks for `filename`. - void RemoveFile(const std::string& filename) override - ABSL_LOCKS_EXCLUDED(mu_); + void RemoveFile(const std::string& filename) ABSL_LOCKS_EXCLUDED(mu_); /// Remove all cached data. - void Flush() override ABSL_LOCKS_EXCLUDED(mu_); + void Flush() ABSL_LOCKS_EXCLUDED(mu_); /// Accessors for cache parameters. - size_t block_size() const override { return block_size_; } - size_t max_bytes() const override { return max_bytes_; } - uint64_t max_staleness() const override { return max_staleness_; } + size_t block_size() const { return block_size_; } + size_t max_bytes() const { return max_bytes_; } + uint64_t max_staleness() const { return max_staleness_; } /// The current size (in bytes) of the cache. - size_t CacheSize() const override ABSL_LOCKS_EXCLUDED(mu_); + size_t CacheSize() const ABSL_LOCKS_EXCLUDED(mu_); // Returns true if the cache is enabled. If false, the BlockFetcher callback // is always executed during Read. - bool IsCacheEnabled() const override { - return block_size_ > 0 && max_bytes_ > 0; - } + bool IsCacheEnabled() const { return block_size_ > 0 && max_bytes_ > 0; } // We can not pass a lambda with capture as a function pointer to // `TF_StartThread`, so we have to wrap `Prune` inside a static function. 
diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache_test.cc b/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache_test.cc index b1ea295c080..859d42d85e3 100644 --- a/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache_test.cc +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache_test.cc @@ -33,20 +33,22 @@ Status ReadCache(tf_gcs_filesystem::RamFileBlockCache* cache, std::vector* out) { out->clear(); out->resize(n, 0); - size_t bytes_transferred = 0; TF_Status status; - cache->Read(filename, offset, n, out->data(), &bytes_transferred, &status); - EXPECT_LE(bytes_transferred, n); - out->resize(bytes_transferred, n); + auto bytes_transferred = + cache->Read(filename, offset, n, out->data(), &status); + if (bytes_transferred >= 0) { + EXPECT_LE(bytes_transferred, n); + out->resize(bytes_transferred, n); + } return status.status; } TEST(RamFileBlockCacheTest, IsCacheEnabled) { auto fetcher = [](const string& filename, size_t offset, size_t n, - char* buffer, size_t* bytes_transferred, - TF_Status* status) { + char* buffer, TF_Status* status) -> int64_t { // Do nothing. - return TF_SetStatus(status, TF_OK, ""); + TF_SetStatus(status, TF_OK, ""); + return 0; }; tf_gcs_filesystem::RamFileBlockCache cache1(0, 0, 0, fetcher); tf_gcs_filesystem::RamFileBlockCache cache2(16, 0, 0, fetcher); @@ -62,12 +64,11 @@ TEST(RamFileBlockCacheTest, IsCacheEnabled) { TEST(RamFileBlockCacheTest, ValidateAndUpdateFileSignature) { int calls = 0; auto fetcher = [&calls](const string& filename, size_t offset, size_t n, - char* buffer, size_t* bytes_transferred, - TF_Status* status) { + char* buffer, TF_Status* status) -> int64_t { calls++; memset(buffer, 'x', n); - *bytes_transferred = n; - return TF_SetStatus(status, TF_OK, ""); + TF_SetStatus(status, TF_OK, ""); + return n; }; string filename = "file"; tf_gcs_filesystem::RamFileBlockCache cache(16, 32, 0, fetcher); @@ -96,15 +97,14 @@ TEST(RamFileBlockCacheTest, PassThrough) { int calls = 0; auto fetcher = [&calls, want_filename, want_offset, want_n]( const string& got_filename, size_t got_offset, - size_t got_n, char* buffer, size_t* bytes_transferred, - TF_Status* status) { + size_t got_n, char* buffer, TF_Status* status) -> int64_t { EXPECT_EQ(got_filename, want_filename); EXPECT_EQ(got_offset, want_offset); EXPECT_EQ(got_n, want_n); calls++; memset(buffer, 'x', got_n); - *bytes_transferred = got_n; - return TF_SetStatus(status, TF_OK, ""); + TF_SetStatus(status, TF_OK, ""); + return got_n; }; // If block_size, max_bytes, or both are zero, or want_n is larger than // max_bytes the cache is a pass-through. @@ -133,16 +133,17 @@ TEST(RamFileBlockCacheTest, BlockAlignment) { } // The fetcher just fetches slices of the buffer. 
auto fetcher = [&buf](const string& filename, size_t offset, size_t n, - char* buffer, size_t* bytes_transferred, - TF_Status* status) { + char* buffer, TF_Status* status) -> int64_t { + int64_t bytes_transferred; if (offset < buf.size()) { size_t bytes_to_copy = std::min(buf.size() - offset, n); memcpy(buffer, buf.data() + offset, bytes_to_copy); - *bytes_transferred = bytes_to_copy; + bytes_transferred = bytes_to_copy; } else { - *bytes_transferred = 0; + bytes_transferred = 0; } - return TF_SetStatus(status, TF_OK, ""); + TF_SetStatus(status, TF_OK, ""); + return bytes_transferred; }; for (size_t block_size = 2; block_size <= 4; block_size++) { // Make a cache of N-byte block size (1 block) and verify that reads of @@ -181,15 +182,14 @@ TEST(RamFileBlockCacheTest, CacheHits) { std::set calls; auto fetcher = [&calls, block_size](const string& filename, size_t offset, size_t n, char* buffer, - size_t* bytes_transferred, - TF_Status* status) { + TF_Status* status) -> int64_t { EXPECT_EQ(n, block_size); EXPECT_EQ(offset % block_size, 0); EXPECT_EQ(calls.find(offset), calls.end()) << "at offset " << offset; calls.insert(offset); memset(buffer, 'x', n); - *bytes_transferred = n; - return TF_SetStatus(status, TF_OK, ""); + TF_SetStatus(status, TF_OK, ""); + return n; }; const uint32 block_count = 256; tf_gcs_filesystem::RamFileBlockCache cache( @@ -215,8 +215,7 @@ TEST(RamFileBlockCacheTest, OutOfRange) { bool second_block = false; auto fetcher = [block_size, file_size, &first_block, &second_block]( const string& filename, size_t offset, size_t n, - char* buffer, size_t* bytes_transferred, - TF_Status* status) { + char* buffer, TF_Status* status) -> int64_t { EXPECT_EQ(n, block_size); EXPECT_EQ(offset % block_size, 0); size_t bytes_to_copy = 0; @@ -231,8 +230,8 @@ TEST(RamFileBlockCacheTest, OutOfRange) { memset(buffer, 'x', bytes_to_copy); second_block = true; } - *bytes_transferred = bytes_to_copy; - return TF_SetStatus(status, TF_OK, ""); + TF_SetStatus(status, TF_OK, ""); + return bytes_to_copy; }; tf_gcs_filesystem::RamFileBlockCache cache(block_size, block_size, 0, fetcher); @@ -260,14 +259,13 @@ TEST(RamFileBlockCacheTest, Inconsistent) { const size_t block_size = 16; // This fetcher returns OK but only fills in one byte for any offset. 
auto fetcher = [block_size](const string& filename, size_t offset, size_t n, - char* buffer, size_t* bytes_transferred, - TF_Status* status) { + char* buffer, TF_Status* status) -> int64_t { EXPECT_EQ(n, block_size); EXPECT_EQ(offset % block_size, 0); EXPECT_GE(n, 1); memset(buffer, 'x', 1); - *bytes_transferred = 1; - return TF_SetStatus(status, TF_OK, ""); + TF_SetStatus(status, TF_OK, ""); + return 1; }; tf_gcs_filesystem::RamFileBlockCache cache(block_size, 2 * block_size, 0, fetcher); @@ -286,8 +284,7 @@ TEST(RamFileBlockCacheTest, LRU) { std::list calls; auto fetcher = [&calls, block_size](const string& filename, size_t offset, size_t n, char* buffer, - size_t* bytes_transferred, - TF_Status* status) { + TF_Status* status) -> int64_t { EXPECT_EQ(n, block_size); EXPECT_FALSE(calls.empty()) << "at offset = " << offset; if (!calls.empty()) { @@ -295,8 +292,8 @@ TEST(RamFileBlockCacheTest, LRU) { calls.pop_front(); } memset(buffer, 'x', n); - *bytes_transferred = n; - return TF_SetStatus(status, TF_OK, ""); + TF_SetStatus(status, TF_OK, ""); + return n; }; const uint32 block_count = 2; tf_gcs_filesystem::RamFileBlockCache cache( @@ -335,12 +332,11 @@ TEST(RamFileBlockCacheTest, LRU) { TEST(RamFileBlockCacheTest, MaxStaleness) { int calls = 0; auto fetcher = [&calls](const string& filename, size_t offset, size_t n, - char* buffer, size_t* bytes_transferred, - TF_Status* status) { + char* buffer, TF_Status* status) -> int64_t { calls++; memset(buffer, 'x', n); - *bytes_transferred = n; - return TF_SetStatus(status, TF_OK, ""); + TF_SetStatus(status, TF_OK, ""); + return n; }; std::vector out; std::unique_ptr env(new NowSecondsEnv); @@ -380,8 +376,7 @@ TEST(RamFileBlockCacheTest, MaxStaleness) { TEST(RamFileBlockCacheTest, RemoveFile) { int calls = 0; auto fetcher = [&calls](const string& filename, size_t offset, size_t n, - char* buffer, size_t* bytes_transferred, - TF_Status* status) { + char* buffer, TF_Status* status) -> int64_t { calls++; char c = (filename == "a") ? 'a' : (filename == "b") ? 'b' : 'x'; if (offset > 0) { @@ -389,8 +384,8 @@ TEST(RamFileBlockCacheTest, RemoveFile) { c = toupper(c); } memset(buffer, c, n); - *bytes_transferred = n; - return TF_SetStatus(status, TF_OK, ""); + TF_SetStatus(status, TF_OK, ""); + return n; }; // This cache has space for 4 blocks; we'll read from two files. const size_t n = 3; @@ -443,12 +438,11 @@ TEST(RamFileBlockCacheTest, RemoveFile) { TEST(RamFileBlockCacheTest, Prune) { int calls = 0; auto fetcher = [&calls](const string& filename, size_t offset, size_t n, - char* buffer, size_t* bytes_transferred, - TF_Status* status) { + char* buffer, TF_Status* status) -> int64_t { calls++; memset(buffer, 'x', n); - *bytes_transferred = n; - return TF_SetStatus(status, TF_OK, ""); + TF_SetStatus(status, TF_OK, ""); + return n; }; std::vector out; // Our fake environment is initialized with the current timestamp. @@ -509,17 +503,17 @@ TEST(RamFileBlockCacheTest, ParallelReads) { const int callers = 4; BlockingCounter counter(callers); auto fetcher = [&counter](const string& filename, size_t offset, size_t n, - char* buffer, size_t* bytes_transferred, - TF_Status* status) { + char* buffer, TF_Status* status) -> int64_t { counter.DecrementCount(); if (!counter.WaitFor(std::chrono::seconds(10))) { // This avoids having the test time out, which is harder to debug. 
- return TF_SetStatus(status, TF_FAILED_PRECONDITION, - "desired concurrency not reached"); + TF_SetStatus(status, TF_FAILED_PRECONDITION, + "desired concurrency not reached"); + return -1; } memset(buffer, 'x', n); - *bytes_transferred = n; - return TF_SetStatus(status, TF_OK, ""); + TF_SetStatus(status, TF_OK, ""); + return n; }; const int block_size = 8; tf_gcs_filesystem::RamFileBlockCache cache( @@ -548,17 +542,16 @@ TEST(RamFileBlockCacheTest, CoalesceConcurrentReads) { Notification notification; auto fetcher = [&num_requests, ¬ification, block_size]( const string& filename, size_t offset, size_t n, - char* buffer, size_t* bytes_transferred, - TF_Status* status) { + char* buffer, TF_Status* status) -> int64_t { EXPECT_EQ(n, block_size); EXPECT_EQ(offset, 0); num_requests++; memset(buffer, 'x', n); - *bytes_transferred = n; notification.Notify(); // Wait for other thread to issue read. Env::Default()->SleepForMicroseconds(100000); // 0.1 secs - return TF_SetStatus(status, TF_OK, ""); + TF_SetStatus(status, TF_OK, ""); + return n; }; tf_gcs_filesystem::RamFileBlockCache cache(block_size, block_size, 0, fetcher); @@ -580,12 +573,11 @@ TEST(RamFileBlockCacheTest, CoalesceConcurrentReads) { TEST(RamFileBlockCacheTest, Flush) { int calls = 0; auto fetcher = [&calls](const string& filename, size_t offset, size_t n, - char* buffer, size_t* bytes_transferred, - TF_Status* status) { + char* buffer, TF_Status* status) -> int64_t { calls++; memset(buffer, 'x', n); - *bytes_transferred = n; - return TF_SetStatus(status, TF_OK, ""); + TF_SetStatus(status, TF_OK, ""); + return n; }; tf_gcs_filesystem::RamFileBlockCache cache(16, 32, 0, fetcher); std::vector out; diff --git a/tensorflow/c/experimental/filesystem/plugins/hadoop/BUILD b/tensorflow/c/experimental/filesystem/plugins/hadoop/BUILD new file mode 100644 index 00000000000..51ffd709f3d --- /dev/null +++ b/tensorflow/c/experimental/filesystem/plugins/hadoop/BUILD @@ -0,0 +1,35 @@ +# Experimental hadoop filesystem plugin. +load("//tensorflow:tensorflow.bzl", "get_win_copts", "tf_cc_shared_object") + +package( + licenses = ["notice"], # Apache 2.0 +) + +# Filesystem implementation for HADOOP environments +tf_cc_shared_object( + name = "hadoop_filesystem", + framework_so = [], + linkstatic = False, + per_os_targets = 1, + visibility = ["//visibility:public"], + deps = [":hadoop_filesystem_impl"], +) + +# The real implementation of the filesystem. +cc_library( + name = "hadoop_filesystem_impl", + srcs = ["hadoop_filesystem.cc"], + hdrs = ["hadoop_filesystem.h"], + copts = select({ + "//conditions:default": [], + "//tensorflow:windows": get_win_copts(), + }), + deps = [ + "//tensorflow/c:env", + "//tensorflow/c:tf_status", + "//tensorflow/c/experimental/filesystem:filesystem_interface", + "//third_party/hadoop:hdfs", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/synchronization", + ], +) diff --git a/tensorflow/c/experimental/filesystem/plugins/hadoop/hadoop_filesystem.cc b/tensorflow/c/experimental/filesystem/plugins/hadoop/hadoop_filesystem.cc new file mode 100644 index 00000000000..e53e3d0bcc5 --- /dev/null +++ b/tensorflow/c/experimental/filesystem/plugins/hadoop/hadoop_filesystem.cc @@ -0,0 +1,660 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/c/experimental/filesystem/plugins/hadoop/hadoop_filesystem.h" + +#include +#include + +#include +#include +#include +#include + +#include "absl/synchronization/mutex.h" +#include "tensorflow/c/env.h" +#include "tensorflow/c/experimental/filesystem/filesystem_interface.h" +#include "tensorflow/c/tf_status.h" +#include "third_party/hadoop/hdfs.h" + +// Implementation of a filesystem for HADOOP environments. +// This filesystem will support `hdfs://`, `viewfs://` and `har://` URI schemes. + +static void* plugin_memory_allocate(size_t size) { return calloc(1, size); } +static void plugin_memory_free(void* ptr) { free(ptr); } + +void ParseHadoopPath(const std::string& fname, std::string* scheme, + std::string* namenode, std::string* path) { + size_t scheme_end = fname.find("://") + 2; + *scheme = fname.substr(0, scheme_end + 1); + size_t nn_end = fname.find("/", scheme_end + 1); + if (nn_end == std::string::npos) return; + *namenode = fname.substr(scheme_end + 1, nn_end - scheme_end - 1); + *path = fname.substr(nn_end + 1); +} + +void SplitArchiveNameAndPath(std::string* path, std::string* nn, + TF_Status* status) { + size_t index_end_archive_name = path->find(".har"); + if (index_end_archive_name == path->npos) { + return TF_SetStatus( + status, TF_INVALID_ARGUMENT, + "Hadoop archive path does not contain a .har extension"); + } + // Case of hadoop archive. Namenode is the path to the archive. 
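+  // For example (values are illustrative): with *nn == "namenode" and
+  // *path == "/user/data.har/dir/file", the code below produces
+  // *nn == "har://namenode/user/data.har" and *path == "/dir/file".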
+ std::ostringstream namenodestream; + namenodestream << "har://" << nn + << path->substr(0, index_end_archive_name + 4); + *nn = namenodestream.str(); + path->erase(0, index_end_archive_name + 4); + if (path->empty()) + // Root of the archive + *path = "/"; + return TF_SetStatus(status, TF_OK, ""); +} + +template +void BindFunc(void* handle, const char* name, std::function* func, + TF_Status* status) { + *func = reinterpret_cast( + TF_GetSymbolFromLibrary(handle, name, status)); +} + +class LibHDFS { + public: + explicit LibHDFS(TF_Status* status) { LoadAndBind(status); } + + std::function hdfsBuilderConnect; + std::function hdfsNewBuilder; + std::function hdfsBuilderSetNameNode; + std::function hdfsConfGetStr; + std::function hdfsCloseFile; + std::function hdfsPread; + std::function hdfsWrite; + std::function hdfsHFlush; + std::function hdfsHSync; + std::function hdfsTell; + std::function + hdfsOpenFile; + std::function hdfsExists; + std::function hdfsListDirectory; + std::function hdfsFreeFileInfo; + std::function hdfsDelete; + std::function hdfsCreateDirectory; + std::function hdfsGetPathInfo; + std::function hdfsRename; + + private: + void LoadAndBind(TF_Status* status) { + auto TryLoadAndBind = [this](const char* name, void** handle, + TF_Status* status) { + *handle = TF_LoadSharedLibrary(name, status); + if (TF_GetCode(status) != TF_OK) return; + +#define BIND_HDFS_FUNC(function) \ + do { \ + BindFunc(*handle, #function, &function, status); \ + if (TF_GetCode(status) != TF_OK) return; \ + } while (0); + + BIND_HDFS_FUNC(hdfsBuilderConnect); + BIND_HDFS_FUNC(hdfsNewBuilder); + BIND_HDFS_FUNC(hdfsBuilderSetNameNode); + BIND_HDFS_FUNC(hdfsConfGetStr); + BIND_HDFS_FUNC(hdfsCloseFile); + BIND_HDFS_FUNC(hdfsPread); + BIND_HDFS_FUNC(hdfsWrite); + BIND_HDFS_FUNC(hdfsHFlush); + BIND_HDFS_FUNC(hdfsTell); + BIND_HDFS_FUNC(hdfsHSync); + BIND_HDFS_FUNC(hdfsOpenFile); + BIND_HDFS_FUNC(hdfsExists); + BIND_HDFS_FUNC(hdfsListDirectory); + BIND_HDFS_FUNC(hdfsFreeFileInfo); + BIND_HDFS_FUNC(hdfsDelete); + BIND_HDFS_FUNC(hdfsCreateDirectory); + BIND_HDFS_FUNC(hdfsGetPathInfo); + BIND_HDFS_FUNC(hdfsRename); + +#undef BIND_HDFS_FUNC + }; + + // libhdfs.so won't be in the standard locations. Use the path as specified + // in the libhdfs documentation. +#if defined(_WIN32) + constexpr char kLibHdfsDso[] = "hdfs.dll"; +#elif defined(__GNUC__) && (defined(__APPLE_CPP__) || defined(__APPLE_CC__) || \ + defined(__MACOS_CLASSIC__)) + constexpr char kLibHdfsDso[] = "libhdfs.dylib"; +#else + constexpr char kLibHdfsDso[] = "libhdfs.so"; +#endif + char* hdfs_home = getenv("HADOOP_HDFS_HOME"); + if (hdfs_home != nullptr) { + auto JoinPath = [](std::string home, std::string lib) { + if (home.back() != '/') home.push_back('/'); + return home + "lib/native/" + lib; + }; + std::string path = JoinPath(hdfs_home, kLibHdfsDso); + TryLoadAndBind(path.c_str(), &handle_, status); + if (TF_GetCode(status) == TF_OK) { + return; + } else { + std::cerr << "HadoopFileSystem load error: " << TF_Message(status); + } + } + + // Try to load the library dynamically in case it has been installed + // to a in non-standard location. + TryLoadAndBind(kLibHdfsDso, &handle_, status); + } + + void* handle_; +}; + +// We rely on HDFS connection caching here. The HDFS client calls +// org.apache.hadoop.fs.FileSystem.get(), which caches the connection +// internally. 
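+// Parses `path`, configures an hdfsBuilder for its scheme (file, viewfs, har
+// or plain hdfs) and returns the connected hdfsFS, or nullptr with `status`
+// set on failure.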
+hdfsFS Connect(LibHDFS* libhdfs, const std::string& path, TF_Status* status) { + std::string scheme, namenode, hdfs_path; + ParseHadoopPath(path, &scheme, &namenode, &hdfs_path); + + hdfsBuilder* builder = libhdfs->hdfsNewBuilder(); + if (scheme == "file") { + libhdfs->hdfsBuilderSetNameNode(builder, nullptr); + } else if (scheme == "viewfs") { + char* defaultFS = nullptr; + libhdfs->hdfsConfGetStr("fs.defaultFS", &defaultFS); + std::string defaultScheme, defaultCluster, defaultPath; + ParseHadoopPath(defaultFS, &defaultScheme, &defaultCluster, &defaultPath); + + if (scheme != defaultScheme || + (namenode.empty() && namenode != defaultCluster)) { + TF_SetStatus(status, TF_UNIMPLEMENTED, + "viewfs is only supported as a fs.defaultFS."); + return nullptr; + } + // The default NameNode configuration will be used (from the XML + // configuration files). See: + // https://github.com/tensorflow/tensorflow/blob/v1.0.0/third_party/hadoop/hdfs.h#L259 + libhdfs->hdfsBuilderSetNameNode(builder, "default"); + } else if (scheme == "har") { + std::string path_har = path; + SplitArchiveNameAndPath(&path_har, &namenode, status); + if (TF_GetCode(status) != TF_OK) return nullptr; + libhdfs->hdfsBuilderSetNameNode(builder, namenode.c_str()); + } else { + libhdfs->hdfsBuilderSetNameNode( + builder, namenode.empty() ? "default" : namenode.c_str()); + } + auto fs = libhdfs->hdfsBuilderConnect(builder); + if (fs == nullptr) + TF_SetStatusFromIOError(status, TF_NOT_FOUND, strerror(errno)); + else + TF_SetStatus(status, TF_OK, ""); + return fs; +} + +// SECTION 1. Implementation for `TF_RandomAccessFile` +// ---------------------------------------------------------------------------- +namespace tf_random_access_file { +typedef struct HDFSFile { + std::string path; + std::string hdfs_path; + hdfsFS fs; + LibHDFS* libhdfs; + absl::Mutex mu; + hdfsFile handle ABSL_GUARDED_BY(mu); + HDFSFile(std::string path, std::string hdfs_path, hdfsFS fs, LibHDFS* libhdfs, + hdfsFile handle) + : path(std::move(path)), + hdfs_path(std::move(hdfs_path)), + fs(fs), + libhdfs(libhdfs), + mu(), + handle(handle) {} +} HDFSFile; + +void Cleanup(TF_RandomAccessFile* file) { + auto hdfs_file = static_cast(file->plugin_file); + { + absl::MutexLock l(&hdfs_file->mu); + if (hdfs_file->handle != nullptr) { + hdfs_file->libhdfs->hdfsCloseFile(hdfs_file->fs, hdfs_file->handle); + } + } + delete hdfs_file; +} + +int64_t Read(const TF_RandomAccessFile* file, uint64_t offset, size_t n, + char* buffer, TF_Status* status) { + auto hdfs_file = static_cast(file->plugin_file); + auto libhdfs = hdfs_file->libhdfs; + auto fs = hdfs_file->fs; + auto hdfs_path = hdfs_file->hdfs_path.c_str(); + auto path = hdfs_file->path.c_str(); + + char* dst = buffer; + bool eof_retried = false; + int64_t r = 0; + while (TF_GetCode(status) == TF_OK && !eof_retried) { + // We lock inside the loop rather than outside so we don't block other + // concurrent readers. + absl::MutexLock l(&hdfs_file->mu); + auto handle = hdfs_file->handle; + // Max read length is INT_MAX-2, for hdfsPread function take a parameter + // of int32. -2 offset can avoid JVM OutOfMemoryError. + size_t read_n = + (std::min)(n, static_cast(std::numeric_limits::max() - 2)); + r = libhdfs->hdfsPread(fs, handle, static_cast(offset), dst, + static_cast(read_n)); + if (r > 0) { + dst += r; + n -= r; + offset += r; + } else if (!eof_retried && r == 0) { + // Always reopen the file upon reaching EOF to see if there's more data. 
+ // If writers are streaming contents while others are concurrently + // reading, HDFS requires that we reopen the file to see updated + // contents. + // + // Fixes #5438 + if (handle != nullptr && libhdfs->hdfsCloseFile(fs, handle) != 0) { + TF_SetStatusFromIOError(status, errno, path); + return -1; + } + handle = libhdfs->hdfsOpenFile(fs, hdfs_path, O_RDONLY, 0, 0, 0); + if (handle == nullptr) { + TF_SetStatusFromIOError(status, errno, path); + return -1; + } + eof_retried = true; + } else if (eof_retried && r == 0) { + TF_SetStatus(status, TF_OUT_OF_RANGE, "Read less bytes than requested"); + } else if (errno == EINTR || errno == EAGAIN) { + // hdfsPread may return EINTR too. Just retry. + } else { + TF_SetStatusFromIOError(status, errno, path); + } + } + return r; +} + +} // namespace tf_random_access_file + +// SECTION 2. Implementation for `TF_WritableFile` +// ---------------------------------------------------------------------------- +namespace tf_writable_file { +typedef struct HDFSFile { + std::string hdfs_path; + hdfsFS fs; + LibHDFS* libhdfs; + hdfsFile handle; + HDFSFile(std::string hdfs_path, hdfsFS fs, LibHDFS* libhdfs, hdfsFile handle) + : hdfs_path(std::move(hdfs_path)), + fs(fs), + libhdfs(libhdfs), + handle(handle) {} +} HDFSFile; + +static void Cleanup(TF_WritableFile* file) { + auto hdfs_file = static_cast(file->plugin_file); + hdfs_file->libhdfs->hdfsCloseFile(hdfs_file->fs, hdfs_file->handle); + hdfs_file->fs = nullptr; + hdfs_file->handle = nullptr; + delete hdfs_file; +} + +void Append(const TF_WritableFile* file, const char* buffer, size_t n, + TF_Status* status) { + auto hdfs_file = static_cast(file->plugin_file); + auto libhdfs = hdfs_file->libhdfs; + auto fs = hdfs_file->fs; + auto handle = hdfs_file->handle; + + size_t cur_pos = 0, write_len = 0; + bool retry = false; + // max() - 2 can avoid OutOfMemoryError in JVM . 
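+  // hdfsWrite takes a tSize (int32) length, so large appends are written in
+  // chunks of at most `max_len_once` bytes; only the first EINTR/EAGAIN
+  // failure is retried, a second transient failure aborts the append.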
+ static const size_t max_len_once = + static_cast(std::numeric_limits::max() - 2); + while (cur_pos < n) { + write_len = (std::min)(n - cur_pos, max_len_once); + tSize w = libhdfs->hdfsWrite(fs, handle, buffer + cur_pos, + static_cast(write_len)); + if (w == -1) { + if (!retry && (errno == EINTR || errno == EAGAIN)) { + retry = true; + } else { + return TF_SetStatusFromIOError(status, errno, + hdfs_file->hdfs_path.c_str()); + } + } else { + cur_pos += w; + } + } + TF_SetStatus(status, TF_OK, ""); +} + +int64_t Tell(const TF_WritableFile* file, TF_Status* status) { + auto hdfs_file = static_cast(file->plugin_file); + int64_t position = + hdfs_file->libhdfs->hdfsTell(hdfs_file->fs, hdfs_file->handle); + if (position == -1) + TF_SetStatusFromIOError(status, errno, hdfs_file->hdfs_path.c_str()); + else + TF_SetStatus(status, TF_OK, ""); + return position; +} + +void Flush(const TF_WritableFile* file, TF_Status* status) { + auto hdfs_file = static_cast(file->plugin_file); + if (hdfs_file->libhdfs->hdfsHFlush(hdfs_file->fs, hdfs_file->handle) != 0) + TF_SetStatusFromIOError(status, errno, hdfs_file->hdfs_path.c_str()); + else + TF_SetStatus(status, TF_OK, ""); +} + +void Sync(const TF_WritableFile* file, TF_Status* status) { + auto hdfs_file = static_cast(file->plugin_file); + if (hdfs_file->libhdfs->hdfsHSync(hdfs_file->fs, hdfs_file->handle) != 0) + TF_SetStatusFromIOError(status, errno, hdfs_file->hdfs_path.c_str()); + else + TF_SetStatus(status, TF_OK, ""); +} + +void Close(const TF_WritableFile* file, TF_Status* status) { + auto hdfs_file = static_cast(file->plugin_file); + TF_SetStatus(status, TF_OK, ""); + if (hdfs_file->libhdfs->hdfsCloseFile(hdfs_file->fs, hdfs_file->handle) != 0) + TF_SetStatusFromIOError(status, errno, hdfs_file->hdfs_path.c_str()); + hdfs_file->fs = nullptr; + hdfs_file->handle = nullptr; +} + +} // namespace tf_writable_file + +// SECTION 3. Implementation for `TF_ReadOnlyMemoryRegion` +// ---------------------------------------------------------------------------- +namespace tf_read_only_memory_region { + +// TODO(vnvo2409): Implement later + +} // namespace tf_read_only_memory_region + +// SECTION 4. 
Implementation for `TF_Filesystem`, the actual filesystem +// ---------------------------------------------------------------------------- +namespace tf_hadoop_filesystem { + +void Init(TF_Filesystem* filesystem, TF_Status* status) { + filesystem->plugin_filesystem = new LibHDFS(status); + if (TF_GetCode(status) != TF_OK) return; + TF_SetStatus(status, TF_OK, ""); +} + +void Cleanup(TF_Filesystem* filesystem) { + auto libhdfs = static_cast(filesystem->plugin_filesystem); + delete libhdfs; +} + +void NewRandomAccessFile(const TF_Filesystem* filesystem, const char* path, + TF_RandomAccessFile* file, TF_Status* status) { + auto libhdfs = static_cast(filesystem->plugin_filesystem); + auto fs = Connect(libhdfs, path, status); + if (TF_GetCode(status) != TF_OK) return; + + std::string scheme, namenode, hdfs_path; + ParseHadoopPath(path, &scheme, &namenode, &hdfs_path); + + auto handle = libhdfs->hdfsOpenFile(fs, hdfs_path.c_str(), O_RDONLY, 0, 0, 0); + if (handle == nullptr) return TF_SetStatusFromIOError(status, errno, path); + + file->plugin_file = + new tf_random_access_file::HDFSFile(path, hdfs_path, fs, libhdfs, handle); + TF_SetStatus(status, TF_OK, ""); +} + +void NewWritableFile(const TF_Filesystem* filesystem, const char* path, + TF_WritableFile* file, TF_Status* status) { + auto libhdfs = static_cast(filesystem->plugin_filesystem); + auto fs = Connect(libhdfs, path, status); + if (TF_GetCode(status) != TF_OK) return; + + std::string scheme, namenode, hdfs_path; + ParseHadoopPath(path, &scheme, &namenode, &hdfs_path); + + auto handle = libhdfs->hdfsOpenFile(fs, hdfs_path.c_str(), + O_WRONLY | O_APPEND, 0, 0, 0); + if (handle == nullptr) return TF_SetStatusFromIOError(status, errno, path); + + file->plugin_file = + new tf_writable_file::HDFSFile(hdfs_path, fs, libhdfs, handle); + TF_SetStatus(status, TF_OK, ""); +} + +void NewReadOnlyMemoryRegionFromFile(const TF_Filesystem* filesystem, + const char* path, + TF_ReadOnlyMemoryRegion* region, + TF_Status* status) { + // hadoopReadZero() technically supports this call with the following + // caveats: + // - It only works up to 2 GB. We'd have to Stat() the file to ensure that + // it fits. + // - If not on the local filesystem, the entire file will be read, making + // it inefficient for callers that assume typical mmap() behavior. 
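+  // Given these caveats, the plugin reports UNIMPLEMENTED instead of trying
+  // to emulate mmap() semantics on top of hadoopReadZero().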
+ TF_SetStatus(status, TF_UNIMPLEMENTED, + "HDFS does not support ReadOnlyMemoryRegion"); +} + +void PathExists(const TF_Filesystem* filesystem, const char* path, + TF_Status* status) { + auto libhdfs = static_cast(filesystem->plugin_filesystem); + auto fs = Connect(libhdfs, path, status); + if (TF_GetCode(status) != TF_OK) return; + + std::string scheme, namenode, hdfs_path; + ParseHadoopPath(path, &scheme, &namenode, &hdfs_path); + + if (libhdfs->hdfsExists(fs, hdfs_path.c_str()) == 0) + TF_SetStatus(status, TF_OK, ""); + else + TF_SetStatus(status, TF_NOT_FOUND, + (std::string(path) + " not found").c_str()); +} + +void Stat(const TF_Filesystem* filesystem, const char* path, + TF_FileStatistics* stats, TF_Status* status) { + auto libhdfs = static_cast(filesystem->plugin_filesystem); + auto fs = Connect(libhdfs, path, status); + if (TF_GetCode(status) != TF_OK) return; + + std::string scheme, namenode, hdfs_path; + ParseHadoopPath(path, &scheme, &namenode, &hdfs_path); + + auto info = libhdfs->hdfsGetPathInfo(fs, hdfs_path.c_str()); + if (info == nullptr) return TF_SetStatusFromIOError(status, errno, path); + + stats->length = static_cast(info->mSize); + stats->mtime_nsec = static_cast(info->mLastMod) * 1e9; + stats->is_directory = info->mKind == kObjectKindDirectory; + libhdfs->hdfsFreeFileInfo(info, 1); + TF_SetStatus(status, TF_OK, ""); +} + +int64_t GetFileSize(const TF_Filesystem* filesystem, const char* path, + TF_Status* status) { + auto libhdfs = static_cast(filesystem->plugin_filesystem); + auto fs = Connect(libhdfs, path, status); + if (TF_GetCode(status) != TF_OK) return -1; + + std::string scheme, namenode, hdfs_path; + ParseHadoopPath(path, &scheme, &namenode, &hdfs_path); + + auto info = libhdfs->hdfsGetPathInfo(fs, hdfs_path.c_str()); + if (info == nullptr) { + TF_SetStatusFromIOError(status, errno, path); + return -1; + } + + TF_SetStatus(status, TF_OK, ""); + auto size = static_cast(info->mSize); + libhdfs->hdfsFreeFileInfo(info, 1); + return size; +} + +void DeleteFile(const TF_Filesystem* filesystem, const char* path, + TF_Status* status) { + auto libhdfs = static_cast(filesystem->plugin_filesystem); + auto fs = Connect(libhdfs, path, status); + if (TF_GetCode(status) != TF_OK) return; + + std::string scheme, namenode, hdfs_path; + ParseHadoopPath(path, &scheme, &namenode, &hdfs_path); + + if (libhdfs->hdfsDelete(fs, hdfs_path.c_str(), /*recursive=*/0) != 0) + TF_SetStatusFromIOError(status, errno, path); + else + TF_SetStatus(status, TF_OK, ""); +} + +void CreateDir(const TF_Filesystem* filesystem, const char* path, + TF_Status* status) { + auto libhdfs = static_cast(filesystem->plugin_filesystem); + auto fs = Connect(libhdfs, path, status); + if (TF_GetCode(status) != TF_OK) return; + + std::string scheme, namenode, hdfs_path; + ParseHadoopPath(path, &scheme, &namenode, &hdfs_path); + + if (libhdfs->hdfsCreateDirectory(fs, hdfs_path.c_str()) != 0) + TF_SetStatusFromIOError(status, errno, path); + else + TF_SetStatus(status, TF_OK, ""); +} + +void DeleteDir(const TF_Filesystem* filesystem, const char* path, + TF_Status* status) { + auto libhdfs = static_cast(filesystem->plugin_filesystem); + auto fs = Connect(libhdfs, path, status); + if (TF_GetCode(status) != TF_OK) return; + + std::string scheme, namenode, hdfs_path; + ParseHadoopPath(path, &scheme, &namenode, &hdfs_path); + + // Count the number of entries in the directory, and only delete if it's + // non-empty. 
This is consistent with the interface, but note that there's + // a race condition where a file may be added after this check, in which + // case the directory will still be deleted. + int entries = 0; + auto info = libhdfs->hdfsListDirectory(fs, hdfs_path.c_str(), &entries); + if (info != nullptr) libhdfs->hdfsFreeFileInfo(info, entries); + + // Due to HDFS bug HDFS-8407, we can't distinguish between an error and empty + // folder, especially for Kerberos enable setup, EAGAIN is quite common when + // the call is actually successful. Check again by Stat. + if (info == nullptr && errno != 0) { + TF_FileStatistics stat; + Stat(filesystem, path, &stat, status); + if (TF_GetCode(status) != TF_OK) return; + } + + if (entries > 0) + return TF_SetStatus(status, TF_FAILED_PRECONDITION, + "Cannot delete a non-empty directory."); + + if (libhdfs->hdfsDelete(fs, hdfs_path.c_str(), /*recursive=*/1) != 0) + TF_SetStatusFromIOError(status, errno, path); + else + TF_SetStatus(status, TF_OK, ""); +} + +void RenameFile(const TF_Filesystem* filesystem, const char* src, + const char* dst, TF_Status* status) { + auto libhdfs = static_cast(filesystem->plugin_filesystem); + auto fs = Connect(libhdfs, src, status); + if (TF_GetCode(status) != TF_OK) return; + + std::string scheme, namenode, hdfs_path_src, hdfs_path_dst; + ParseHadoopPath(src, &scheme, &namenode, &hdfs_path_src); + ParseHadoopPath(dst, &scheme, &namenode, &hdfs_path_dst); + + if (libhdfs->hdfsExists(fs, hdfs_path_dst.c_str()) == 0 && + libhdfs->hdfsDelete(fs, hdfs_path_dst.c_str(), /*recursive=*/0) != 0) + return TF_SetStatusFromIOError(status, errno, dst); + + if (libhdfs->hdfsRename(fs, hdfs_path_src.c_str(), hdfs_path_dst.c_str()) != + 0) + TF_SetStatusFromIOError(status, errno, src); + else + TF_SetStatus(status, TF_OK, ""); +} + +int GetChildren(const TF_Filesystem* filesystem, const char* path, + char*** entries, TF_Status* status) { + auto libhdfs = static_cast(filesystem->plugin_filesystem); + auto fs = Connect(libhdfs, path, status); + if (TF_GetCode(status) != TF_OK) return -1; + + std::string scheme, namenode, hdfs_path; + ParseHadoopPath(path, &scheme, &namenode, &hdfs_path); + + // hdfsListDirectory returns nullptr if the directory is empty. Do a separate + // check to verify the directory exists first. + TF_FileStatistics stat; + Stat(filesystem, path, &stat, status); + if (TF_GetCode(status) != TF_OK) return -1; + + int num_entries = 0; + auto info = libhdfs->hdfsListDirectory(fs, hdfs_path.c_str(), &num_entries); + if (info == nullptr) { + if (stat.is_directory) { + // Assume it's an empty directory. 
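+      // (Stat() above already confirmed the path exists and is a directory,
+      // so a nullptr listing here means "no entries" rather than an error.)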
+ TF_SetStatus(status, TF_OK, ""); + return 0; + } + TF_SetStatusFromIOError(status, errno, path); + return -1; + } + *entries = static_cast( + plugin_memory_allocate(num_entries * sizeof((*entries)[0]))); + auto BaseName = [](const std::string& name) { + return name.substr(name.find_last_of('/') + 1); + }; + for (int i = 0; i < num_entries; i++) { + (*entries)[i] = strdup(BaseName(info[i].mName).c_str()); + } + libhdfs->hdfsFreeFileInfo(info, num_entries); + TF_SetStatus(status, TF_OK, ""); + return num_entries; +} + +// TODO(vnvo2409): Implement later + +} // namespace tf_hadoop_filesystem + +static void ProvideFilesystemSupportFor(TF_FilesystemPluginOps* ops, + const char* uri) { + TF_SetFilesystemVersionMetadata(ops); + ops->scheme = strdup(uri); +} + +void TF_InitPlugin(TF_FilesystemPluginInfo* info) { + info->plugin_memory_allocate = plugin_memory_allocate; + info->plugin_memory_free = plugin_memory_free; + info->num_schemes = 3; + info->ops = static_cast( + plugin_memory_allocate(info->num_schemes * sizeof(info->ops[0]))); + ProvideFilesystemSupportFor(&info->ops[0], "hdfs"); + ProvideFilesystemSupportFor(&info->ops[1], "viewfs"); + ProvideFilesystemSupportFor(&info->ops[2], "har"); +} diff --git a/tensorflow/c/experimental/filesystem/plugins/hadoop/hadoop_filesystem.h b/tensorflow/c/experimental/filesystem/plugins/hadoop/hadoop_filesystem.h new file mode 100644 index 00000000000..850cefe0231 --- /dev/null +++ b/tensorflow/c/experimental/filesystem/plugins/hadoop/hadoop_filesystem.h @@ -0,0 +1,21 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_HADOOP_HADOOP_FILESYSTEM_H_ +#define TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_HADOOP_HADOOP_FILESYSTEM_H_ + +#include "tensorflow/c/experimental/filesystem/filesystem_interface.h" +#include "tensorflow/c/tf_status.h" + +#endif // TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_HADOOP_HADOOP_FILESYSTEM_H_ diff --git a/tensorflow/c/experimental/filesystem/plugins/s3/BUILD b/tensorflow/c/experimental/filesystem/plugins/s3/BUILD new file mode 100644 index 00000000000..56bd3b4a75c --- /dev/null +++ b/tensorflow/c/experimental/filesystem/plugins/s3/BUILD @@ -0,0 +1,63 @@ +# Experimental s3 filesystem plugin. +load("//tensorflow:tensorflow.bzl", "get_win_copts", "tf_cc_shared_object", "tf_cc_test") + +package( + licenses = ["notice"], # Apache 2.0 +) + +# Filesystem implementation for S3 environments +tf_cc_shared_object( + name = "s3_filesystem", + framework_so = [], + linkstatic = False, + per_os_targets = 1, + visibility = ["//visibility:public"], + deps = [":s3_filesystem_impl"], +) + +# The real implementation of the filesystem. 
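+# Kept as a separate cc_library, presumably so that both the shared object
+# above and the test target below can depend on it.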
+cc_library( + name = "s3_filesystem_impl", + srcs = ["s3_filesystem.cc"], + hdrs = ["s3_filesystem.h"], + copts = select({ + "//conditions:default": [], + "//tensorflow:windows": get_win_copts(), + }), + deps = [ + ":aws_crypto", + "//tensorflow/c:tf_status", + "//tensorflow/c/experimental/filesystem:filesystem_interface", + "@aws", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/synchronization", + ], +) + +cc_library( + name = "aws_crypto", + srcs = ["aws_crypto.cc"], + hdrs = ["aws_crypto.h"], + deps = [ + "@aws", + "@boringssl//:crypto", + ], + alwayslink = 1, +) + +tf_cc_test( + name = "s3_filesystem_test", + srcs = [ + "s3_filesystem_test.cc", + ], + tags = [ + "manual", + "notap", + ], + deps = [ + ":s3_filesystem_impl", + "//tensorflow/core/platform:path", + "//tensorflow/core/platform:stacktrace_handler", + "//tensorflow/core/platform:test", + ], +) diff --git a/tensorflow/c/experimental/filesystem/plugins/s3/aws_crypto.cc b/tensorflow/c/experimental/filesystem/plugins/s3/aws_crypto.cc new file mode 100644 index 00000000000..2e15ac176e3 --- /dev/null +++ b/tensorflow/c/experimental/filesystem/plugins/s3/aws_crypto.cc @@ -0,0 +1,133 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/c/experimental/filesystem/plugins/s3/aws_crypto.h" + +#include +#include +#include +#include +#include + +namespace tf_s3_filesystem { + +class AWSSha256HMACOpenSSLImpl : public Aws::Utils::Crypto::HMAC { + public: + AWSSha256HMACOpenSSLImpl() {} + + virtual ~AWSSha256HMACOpenSSLImpl() = default; + + Aws::Utils::Crypto::HashResult Calculate( + const Aws::Utils::ByteBuffer& toSign, + const Aws::Utils::ByteBuffer& secret) override { + unsigned int length = SHA256_DIGEST_LENGTH; + Aws::Utils::ByteBuffer digest(length); + memset(digest.GetUnderlyingData(), 0, length); + + HMAC_CTX ctx; + HMAC_CTX_init(&ctx); + + HMAC_Init_ex(&ctx, secret.GetUnderlyingData(), + static_cast(secret.GetLength()), EVP_sha256(), NULL); + HMAC_Update(&ctx, toSign.GetUnderlyingData(), toSign.GetLength()); + HMAC_Final(&ctx, digest.GetUnderlyingData(), &length); + HMAC_CTX_cleanup(&ctx); + + return Aws::Utils::Crypto::HashResult(std::move(digest)); + } +}; + +class AWSSha256OpenSSLImpl : public Aws::Utils::Crypto::Hash { + public: + AWSSha256OpenSSLImpl() {} + + virtual ~AWSSha256OpenSSLImpl() = default; + + Aws::Utils::Crypto::HashResult Calculate(const Aws::String& str) override { + SHA256_CTX sha256; + SHA256_Init(&sha256); + SHA256_Update(&sha256, str.data(), str.size()); + + Aws::Utils::ByteBuffer hash(SHA256_DIGEST_LENGTH); + SHA256_Final(hash.GetUnderlyingData(), &sha256); + + return Aws::Utils::Crypto::HashResult(std::move(hash)); + } + + Aws::Utils::Crypto::HashResult Calculate(Aws::IStream& stream) override { + SHA256_CTX sha256; + SHA256_Init(&sha256); + + auto currentPos = stream.tellg(); + if (currentPos == std::streampos(std::streamoff(-1))) { + currentPos = 0; + 
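+      // tellg() failed (e.g. the stream is in an error state or is not
+      // seekable), so hash from the beginning and clear the error flags
+      // before seeking.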
stream.clear(); + } + + stream.seekg(0, stream.beg); + + char streamBuffer + [Aws::Utils::Crypto::Hash::INTERNAL_HASH_STREAM_BUFFER_SIZE]; + while (stream.good()) { + stream.read(streamBuffer, + Aws::Utils::Crypto::Hash::INTERNAL_HASH_STREAM_BUFFER_SIZE); + auto bytesRead = stream.gcount(); + + if (bytesRead > 0) { + SHA256_Update(&sha256, streamBuffer, static_cast(bytesRead)); + } + } + + stream.clear(); + stream.seekg(currentPos, stream.beg); + + Aws::Utils::ByteBuffer hash(SHA256_DIGEST_LENGTH); + SHA256_Final(hash.GetUnderlyingData(), &sha256); + + return Aws::Utils::Crypto::HashResult(std::move(hash)); + } +}; + +class AWSSecureRandomBytesImpl : public Aws::Utils::Crypto::SecureRandomBytes { + public: + AWSSecureRandomBytesImpl() {} + virtual ~AWSSecureRandomBytesImpl() = default; + void GetBytes(unsigned char* buffer, size_t bufferSize) override { + assert(buffer); + int success = RAND_bytes(buffer, static_cast(bufferSize)); + if (success != 1) { + m_failure = true; + } + } + + private: + bool m_failure; +}; + +std::shared_ptr +AWSSHA256Factory::CreateImplementation() const { + return Aws::MakeShared(AWSCryptoAllocationTag); +} + +std::shared_ptr +AWSSHA256HmacFactory::CreateImplementation() const { + return Aws::MakeShared(AWSCryptoAllocationTag); +} + +std::shared_ptr +AWSSecureRandomFactory::CreateImplementation() const { + return Aws::MakeShared(AWSCryptoAllocationTag); +} + +} // namespace tf_s3_filesystem diff --git a/tensorflow/c/experimental/filesystem/plugins/s3/aws_crypto.h b/tensorflow/c/experimental/filesystem/plugins/s3/aws_crypto.h new file mode 100644 index 00000000000..a70bf060fc7 --- /dev/null +++ b/tensorflow/c/experimental/filesystem/plugins/s3/aws_crypto.h @@ -0,0 +1,47 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_S3_AWS_CRYPTO_H_ +#define TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_S3_AWS_CRYPTO_H_ + +#include +#include +#include +#include +#include + +namespace tf_s3_filesystem { +constexpr char AWSCryptoAllocationTag[] = "AWSCryptoAllocation"; + +class AWSSHA256Factory : public Aws::Utils::Crypto::HashFactory { + public: + std::shared_ptr CreateImplementation() + const override; +}; + +class AWSSHA256HmacFactory : public Aws::Utils::Crypto::HMACFactory { + public: + std::shared_ptr CreateImplementation() + const override; +}; + +class AWSSecureRandomFactory : public Aws::Utils::Crypto::SecureRandomFactory { + public: + std::shared_ptr CreateImplementation() + const override; +}; + +} // namespace tf_s3_filesystem + +#endif // TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_S3_AWS_CRYPTO_H_ diff --git a/tensorflow/c/experimental/filesystem/plugins/s3/s3_filesystem.cc b/tensorflow/c/experimental/filesystem/plugins/s3/s3_filesystem.cc new file mode 100644 index 00000000000..7e1b36f2dcc --- /dev/null +++ b/tensorflow/c/experimental/filesystem/plugins/s3/s3_filesystem.cc @@ -0,0 +1,1239 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/c/experimental/filesystem/plugins/s3/s3_filesystem.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "absl/strings/ascii.h" +#include "absl/strings/numbers.h" +#include "absl/strings/str_cat.h" +#include "tensorflow/c/experimental/filesystem/filesystem_interface.h" +#include "tensorflow/c/experimental/filesystem/plugins/s3/aws_crypto.h" +#include "tensorflow/c/tf_status.h" + +// Implementation of a filesystem for S3 environments. +// This filesystem will support `s3://` URI schemes. 
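+// Paths are expected to look like `s3://<bucket>/<object>`; see ParseS3Path()
+// below for the exact parsing rules.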
+constexpr char kS3FileSystemAllocationTag[] = "S3FileSystemAllocation"; +constexpr char kS3ClientAllocationTag[] = "S3ClientAllocation"; +constexpr int64_t kS3TimeoutMsec = 300000; // 5 min +constexpr int kS3GetChildrenMaxKeys = 100; + +constexpr char kExecutorTag[] = "TransferManagerExecutorAllocation"; +constexpr int kExecutorPoolSize = 25; + +constexpr uint64_t kS3MultiPartUploadChunkSize = 50 * 1024 * 1024; // 50 MB +constexpr uint64_t kS3MultiPartDownloadChunkSize = 50 * 1024 * 1024; // 50 MB +constexpr size_t kDownloadRetries = 3; +constexpr size_t kUploadRetries = 3; + +constexpr size_t kS3ReadAppendableFileBufferSize = 1024 * 1024; // 1 MB + +static void* plugin_memory_allocate(size_t size) { return calloc(1, size); } +static void plugin_memory_free(void* ptr) { free(ptr); } + +static inline void TF_SetStatusFromAWSError( + const Aws::Client::AWSError& error, TF_Status* status) { + switch (error.GetResponseCode()) { + case Aws::Http::HttpResponseCode::FORBIDDEN: + TF_SetStatus(status, TF_FAILED_PRECONDITION, + "AWS Credentials have not been set properly. " + "Unable to access the specified S3 location"); + break; + case Aws::Http::HttpResponseCode::REQUESTED_RANGE_NOT_SATISFIABLE: + TF_SetStatus(status, TF_OUT_OF_RANGE, "Read less bytes than requested"); + break; + case Aws::Http::HttpResponseCode::NOT_FOUND: + TF_SetStatus(status, TF_NOT_FOUND, error.GetMessage().c_str()); + break; + default: + TF_SetStatus( + status, TF_UNKNOWN, + (error.GetExceptionName() + ": " + error.GetMessage()).c_str()); + break; + } +} + +void ParseS3Path(const Aws::String& fname, bool object_empty_ok, + Aws::String* bucket, Aws::String* object, TF_Status* status) { + size_t scheme_end = fname.find("://") + 2; + if (fname.substr(0, scheme_end + 1) != "s3://") { + TF_SetStatus(status, TF_INVALID_ARGUMENT, + "S3 path doesn't start with 's3://'."); + return; + } + + size_t bucket_end = fname.find("/", scheme_end + 1); + if (bucket_end == std::string::npos) { + TF_SetStatus(status, TF_INVALID_ARGUMENT, + "S3 path doesn't contain a bucket name."); + return; + } + + *bucket = fname.substr(scheme_end + 1, bucket_end - scheme_end - 1); + *object = fname.substr(bucket_end + 1); + + if (object->empty() && !object_empty_ok) { + TF_SetStatus(status, TF_INVALID_ARGUMENT, + "S3 path doesn't contain an object name."); + } +} + +static Aws::Client::ClientConfiguration& GetDefaultClientConfig() { + ABSL_CONST_INIT static absl::Mutex cfg_lock(absl::kConstInit); + static bool init(false); + static Aws::Client::ClientConfiguration cfg; + + absl::MutexLock l(&cfg_lock); + + if (!init) { + const char* endpoint = getenv("S3_ENDPOINT"); + if (endpoint) cfg.endpointOverride = Aws::String(endpoint); + const char* region = getenv("AWS_REGION"); + // TODO (yongtang): `S3_REGION` should be deprecated after 2.0. + if (!region) region = getenv("S3_REGION"); + if (region) { + cfg.region = Aws::String(region); + } else { + // Load config file (e.g., ~/.aws/config) only if AWS_SDK_LOAD_CONFIG + // is set with a truthy value. + const char* load_config_env = getenv("AWS_SDK_LOAD_CONFIG"); + std::string load_config = + load_config_env ? absl::AsciiStrToLower(load_config_env) : ""; + if (load_config == "true" || load_config == "1") { + Aws::String config_file; + // If AWS_CONFIG_FILE is set then use it, otherwise use ~/.aws/config. 
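+        // Only the region of the [default] profile is consulted here.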
+ const char* config_file_env = getenv("AWS_CONFIG_FILE"); + if (config_file_env) { + config_file = config_file_env; + } else { + const char* home_env = getenv("HOME"); + if (home_env) { + config_file = home_env; + config_file += "/.aws/config"; + } + } + Aws::Config::AWSConfigFileProfileConfigLoader loader(config_file); + loader.Load(); + auto profiles = loader.GetProfiles(); + if (!profiles["default"].GetRegion().empty()) + cfg.region = profiles["default"].GetRegion(); + } + } + const char* use_https = getenv("S3_USE_HTTPS"); + if (use_https) { + if (use_https[0] == '0') + cfg.scheme = Aws::Http::Scheme::HTTP; + else + cfg.scheme = Aws::Http::Scheme::HTTPS; + } + const char* verify_ssl = getenv("S3_VERIFY_SSL"); + if (verify_ssl) { + if (verify_ssl[0] == '0') + cfg.verifySSL = false; + else + cfg.verifySSL = true; + } + // if these timeouts are low, you may see an error when + // uploading/downloading large files: Unable to connect to endpoint + int64_t timeout; + cfg.connectTimeoutMs = + absl::SimpleAtoi(getenv("S3_CONNECT_TIMEOUT_MSEC"), &timeout) + ? timeout + : kS3TimeoutMsec; + cfg.requestTimeoutMs = + absl::SimpleAtoi(getenv("S3_REQUEST_TIMEOUT_MSEC"), &timeout) + ? timeout + : kS3TimeoutMsec; + const char* ca_file = getenv("S3_CA_FILE"); + if (ca_file) cfg.caFile = Aws::String(ca_file); + const char* ca_path = getenv("S3_CA_PATH"); + if (ca_path) cfg.caPath = Aws::String(ca_path); + init = true; + } + return cfg; +}; + +static void GetS3Client(tf_s3_filesystem::S3File* s3_file) { + absl::MutexLock l(&s3_file->initialization_lock); + + if (s3_file->s3_client.get() == nullptr) { + Aws::SDKOptions options; + options.cryptoOptions.sha256Factory_create_fn = []() { + return Aws::MakeShared( + tf_s3_filesystem::AWSCryptoAllocationTag); + }; + options.cryptoOptions.sha256HMACFactory_create_fn = []() { + return Aws::MakeShared( + tf_s3_filesystem::AWSCryptoAllocationTag); + }; + options.cryptoOptions.secureRandomFactory_create_fn = []() { + return Aws::MakeShared( + tf_s3_filesystem::AWSCryptoAllocationTag); + }; + Aws::InitAPI(options); + + // The creation of S3Client disables virtual addressing: + // S3Client(clientConfiguration, signPayloads, useVirtualAddressing = + // true) + // The purpose is to address the issue encountered when there is an `.` + // in the bucket name. Due to TLS hostname validation or DNS rules, + // the bucket may not be resolved. Disabling of virtual addressing + // should address the issue. See GitHub issue 16397 for details. + s3_file->s3_client = Aws::MakeShared( + kS3ClientAllocationTag, GetDefaultClientConfig(), + Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never, false); + } +} + +static void GetExecutor(tf_s3_filesystem::S3File* s3_file) { + absl::MutexLock l(&s3_file->initialization_lock); + + if (s3_file->executor.get() == nullptr) { + s3_file->executor = + Aws::MakeShared( + kExecutorTag, kExecutorPoolSize); + } +} + +static void GetTransferManager( + const Aws::Transfer::TransferDirection& direction, + tf_s3_filesystem::S3File* s3_file) { + // These functions should be called before holding `initialization_lock`. 
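+  // (Each of them acquires the same `initialization_lock` internally, and
+  // absl::Mutex is not reentrant, so calling them after the MutexLock below
+  // would deadlock.)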
+ GetS3Client(s3_file); + GetExecutor(s3_file); + + absl::MutexLock l(&s3_file->initialization_lock); + + if (s3_file->transfer_managers[direction].get() == nullptr) { + Aws::Transfer::TransferManagerConfiguration config(s3_file->executor.get()); + config.s3Client = s3_file->s3_client; + config.bufferSize = s3_file->multi_part_chunk_sizes[direction]; + // must be larger than pool size * multi part chunk size + config.transferBufferMaxHeapSize = + (kExecutorPoolSize + 1) * s3_file->multi_part_chunk_sizes[direction]; + s3_file->transfer_managers[direction] = + Aws::Transfer::TransferManager::Create(config); + } +} + +static void ShutdownClient(Aws::S3::S3Client* s3_client) { + if (s3_client != nullptr) { + delete s3_client; + Aws::SDKOptions options; + Aws::ShutdownAPI(options); + } +} + +// SECTION 1. Implementation for `TF_RandomAccessFile` +// ---------------------------------------------------------------------------- +namespace tf_random_access_file { +typedef struct S3File { + Aws::String bucket; + Aws::String object; + std::shared_ptr s3_client; + std::shared_ptr transfer_manager; + bool use_multi_part_download; +} S3File; + +// AWS Streams destroy the buffer (buf) passed, so creating a new +// IOStream that retains the buffer so the calling function +// can control it's lifecycle +class TFS3UnderlyingStream : public Aws::IOStream { + public: + using Base = Aws::IOStream; + TFS3UnderlyingStream(std::streambuf* buf) : Base(buf) {} + virtual ~TFS3UnderlyingStream() = default; +}; + +void Cleanup(TF_RandomAccessFile* file) { + auto s3_file = static_cast(file->plugin_file); + delete s3_file; +} + +static int64_t ReadS3Client(S3File* s3_file, uint64_t offset, size_t n, + char* buffer, TF_Status* status) { + Aws::S3::Model::GetObjectRequest get_object_request; + get_object_request.WithBucket(s3_file->bucket).WithKey(s3_file->object); + Aws::String bytes = + absl::StrCat("bytes=", offset, "-", offset + n - 1).c_str(); + get_object_request.SetRange(bytes); + get_object_request.SetResponseStreamFactory( + []() { return Aws::New(kS3FileSystemAllocationTag); }); + + auto get_object_outcome = s3_file->s3_client->GetObject(get_object_request); + if (!get_object_outcome.IsSuccess()) + TF_SetStatusFromAWSError(get_object_outcome.GetError(), status); + else + TF_SetStatus(status, TF_OK, ""); + if (TF_GetCode(status) != TF_OK && TF_GetCode(status) != TF_OUT_OF_RANGE) + return -1; + + int64_t read = get_object_outcome.GetResult().GetContentLength(); + if (read < n) + TF_SetStatus(status, TF_OUT_OF_RANGE, "Read less bytes than requested"); + get_object_outcome.GetResult().GetBody().read(buffer, read); + return read; +} + +static int64_t ReadS3TransferManager(S3File* s3_file, uint64_t offset, size_t n, + char* buffer, TF_Status* status) { + auto create_download_stream = [&]() { + return Aws::New( + "S3ReadStream", + Aws::New( + "S3ReadStream", reinterpret_cast(buffer), n)); + }; + auto handle = s3_file->transfer_manager->DownloadFile( + s3_file->bucket, s3_file->object, offset, n, create_download_stream); + handle->WaitUntilFinished(); + + size_t retries = 0; + while (handle->GetStatus() == Aws::Transfer::TransferStatus::FAILED && + handle->GetLastError().GetResponseCode() != + Aws::Http::HttpResponseCode::REQUESTED_RANGE_NOT_SATISFIABLE && + retries++ < kDownloadRetries) { + // Only failed parts will be downloaded again. 
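+    // Range-not-satisfiable responses are excluded from the retry condition
+    // above: they indicate the offset is at or past EOF and are surfaced as
+    // TF_OUT_OF_RANGE below instead.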
+ s3_file->transfer_manager->RetryDownload(handle); + handle->WaitUntilFinished(); + } + + if (handle->GetStatus() != Aws::Transfer::TransferStatus::COMPLETED) + TF_SetStatusFromAWSError(handle->GetLastError(), status); + else + TF_SetStatus(status, TF_OK, ""); + if (TF_GetCode(status) != TF_OK && TF_GetCode(status) != TF_OUT_OF_RANGE) + return -1; + int64_t read = handle->GetBytesTransferred(); + if (read < n) + TF_SetStatus(status, TF_OUT_OF_RANGE, "Read less bytes than requested"); + return read; +} + +int64_t Read(const TF_RandomAccessFile* file, uint64_t offset, size_t n, + char* buffer, TF_Status* status) { + auto s3_file = static_cast(file->plugin_file); + if (s3_file->use_multi_part_download) + return ReadS3TransferManager(s3_file, offset, n, buffer, status); + else + return ReadS3Client(s3_file, offset, n, buffer, status); +} + +} // namespace tf_random_access_file + +// SECTION 2. Implementation for `TF_WritableFile` +// ---------------------------------------------------------------------------- +namespace tf_writable_file { +typedef struct S3File { + Aws::String bucket; + Aws::String object; + std::shared_ptr s3_client; + std::shared_ptr transfer_manager; + bool sync_needed; + std::shared_ptr outfile; + S3File(Aws::String bucket, Aws::String object, + std::shared_ptr s3_client, + std::shared_ptr transfer_manager) + : bucket(bucket), + object(object), + s3_client(s3_client), + transfer_manager(transfer_manager), + outfile(Aws::MakeShared( + kS3FileSystemAllocationTag, nullptr, "_s3_filesystem_XXXXXX", + std::ios_base::binary | std::ios_base::trunc | std::ios_base::in | + std::ios_base::out)) {} +} S3File; + +void Cleanup(TF_WritableFile* file) { + auto s3_file = static_cast(file->plugin_file); + delete s3_file; +} + +void Append(const TF_WritableFile* file, const char* buffer, size_t n, + TF_Status* status) { + auto s3_file = static_cast(file->plugin_file); + if (!s3_file->outfile) { + TF_SetStatus(status, TF_FAILED_PRECONDITION, + "The internal temporary file is not writable."); + return; + } + s3_file->sync_needed = true; + s3_file->outfile->write(buffer, n); + if (!s3_file->outfile->good()) + TF_SetStatus(status, TF_INTERNAL, + "Could not append to the internal temporary file."); + else + TF_SetStatus(status, TF_OK, ""); +} + +int64_t Tell(const TF_WritableFile* file, TF_Status* status) { + auto s3_file = static_cast(file->plugin_file); + auto position = static_cast(s3_file->outfile->tellp()); + if (position == -1) + TF_SetStatus(status, TF_INTERNAL, + "tellp on the internal temporary file failed"); + else + TF_SetStatus(status, TF_OK, ""); + return position; +} + +void Sync(const TF_WritableFile* file, TF_Status* status) { + auto s3_file = static_cast(file->plugin_file); + if (!s3_file->outfile) { + TF_SetStatus(status, TF_FAILED_PRECONDITION, + "The internal temporary file is not writable."); + return; + } + if (!s3_file->sync_needed) { + TF_SetStatus(status, TF_OK, ""); + return; + } + auto position = static_cast(s3_file->outfile->tellp()); + auto handle = s3_file->transfer_manager->UploadFile( + s3_file->outfile, s3_file->bucket, s3_file->object, + "application/octet-stream", Aws::Map()); + handle->WaitUntilFinished(); + + size_t retries = 0; + while (handle->GetStatus() == Aws::Transfer::TransferStatus::FAILED && + retries++ < kUploadRetries) { + // if multipart upload was used, only the failed parts will be re-sent + s3_file->transfer_manager->RetryUpload(s3_file->outfile, handle); + handle->WaitUntilFinished(); + } + if (handle->GetStatus() != 
Aws::Transfer::TransferStatus::COMPLETED) + return TF_SetStatusFromAWSError(handle->GetLastError(), status); + s3_file->outfile->clear(); + s3_file->outfile->seekp(position); + s3_file->sync_needed = false; + TF_SetStatus(status, TF_OK, ""); +} + +void Flush(const TF_WritableFile* file, TF_Status* status) { + Sync(file, status); +} + +void Close(const TF_WritableFile* file, TF_Status* status) { + auto s3_file = static_cast(file->plugin_file); + if (s3_file->outfile) { + Sync(file, status); + if (TF_GetCode(status) != TF_OK) return; + s3_file->outfile.reset(); + } + TF_SetStatus(status, TF_OK, ""); +} + +} // namespace tf_writable_file + +// SECTION 3. Implementation for `TF_ReadOnlyMemoryRegion` +// ---------------------------------------------------------------------------- +namespace tf_read_only_memory_region { +typedef struct S3MemoryRegion { + std::unique_ptr data; + uint64_t length; +} S3MemoryRegion; + +void Cleanup(TF_ReadOnlyMemoryRegion* region) { + auto r = static_cast(region->plugin_memory_region); + delete r; +} + +const void* Data(const TF_ReadOnlyMemoryRegion* region) { + auto r = static_cast(region->plugin_memory_region); + return reinterpret_cast(r->data.get()); +} + +uint64_t Length(const TF_ReadOnlyMemoryRegion* region) { + auto r = static_cast(region->plugin_memory_region); + return r->length; +} + +} // namespace tf_read_only_memory_region + +// SECTION 4. Implementation for `TF_Filesystem`, the actual filesystem +// ---------------------------------------------------------------------------- +namespace tf_s3_filesystem { +S3File::S3File() + : s3_client(nullptr, ShutdownClient), + executor(nullptr), + transfer_managers(), + multi_part_chunk_sizes(), + use_multi_part_download(true), + initialization_lock() { + uint64_t temp_value; + multi_part_chunk_sizes[Aws::Transfer::TransferDirection::UPLOAD] = + absl::SimpleAtoi(getenv("S3_MULTI_PART_UPLOAD_CHUNK_SIZE"), &temp_value) + ? temp_value + : kS3MultiPartUploadChunkSize; + multi_part_chunk_sizes[Aws::Transfer::TransferDirection::DOWNLOAD] = + absl::SimpleAtoi(getenv("S3_MULTI_PART_DOWNLOAD_CHUNK_SIZE"), &temp_value) + ? temp_value + : kS3MultiPartDownloadChunkSize; + use_multi_part_download = + absl::SimpleAtoi(getenv("S3_DISABLE_MULTI_PART_DOWNLOAD"), &temp_value) + ? 
(temp_value != 1) + : use_multi_part_download; + transfer_managers.emplace(Aws::Transfer::TransferDirection::UPLOAD, nullptr); + transfer_managers.emplace(Aws::Transfer::TransferDirection::DOWNLOAD, + nullptr); +} +void Init(TF_Filesystem* filesystem, TF_Status* status) { + filesystem->plugin_filesystem = new S3File(); + TF_SetStatus(status, TF_OK, ""); +} + +void Cleanup(TF_Filesystem* filesystem) { + auto s3_file = static_cast(filesystem->plugin_filesystem); + delete s3_file; +} + +void NewRandomAccessFile(const TF_Filesystem* filesystem, const char* path, + TF_RandomAccessFile* file, TF_Status* status) { + Aws::String bucket, object; + ParseS3Path(path, false, &bucket, &object, status); + if (TF_GetCode(status) != TF_OK) return; + + auto s3_file = static_cast(filesystem->plugin_filesystem); + GetS3Client(s3_file); + GetTransferManager(Aws::Transfer::TransferDirection::DOWNLOAD, s3_file); + file->plugin_file = new tf_random_access_file::S3File( + {bucket, object, s3_file->s3_client, + s3_file->transfer_managers[Aws::Transfer::TransferDirection::DOWNLOAD], + s3_file->use_multi_part_download}); + TF_SetStatus(status, TF_OK, ""); +} + +void NewWritableFile(const TF_Filesystem* filesystem, const char* path, + TF_WritableFile* file, TF_Status* status) { + Aws::String bucket, object; + ParseS3Path(path, false, &bucket, &object, status); + if (TF_GetCode(status) != TF_OK) return; + + auto s3_file = static_cast(filesystem->plugin_filesystem); + GetS3Client(s3_file); + GetTransferManager(Aws::Transfer::TransferDirection::UPLOAD, s3_file); + file->plugin_file = new tf_writable_file::S3File( + bucket, object, s3_file->s3_client, + s3_file->transfer_managers[Aws::Transfer::TransferDirection::UPLOAD]); + TF_SetStatus(status, TF_OK, ""); +} + +void NewAppendableFile(const TF_Filesystem* filesystem, const char* path, + TF_WritableFile* file, TF_Status* status) { + Aws::String bucket, object; + ParseS3Path(path, false, &bucket, &object, status); + if (TF_GetCode(status) != TF_OK) return; + + auto s3_file = static_cast(filesystem->plugin_filesystem); + GetS3Client(s3_file); + GetTransferManager(Aws::Transfer::TransferDirection::UPLOAD, s3_file); + + // We need to delete `file->plugin_file` in case of errors. We set + // `file->plugin_file` to `nullptr` in order to avoid segment fault when + // calling deleter of `unique_ptr`. + file->plugin_file = nullptr; + std::unique_ptr writer( + file, [](TF_WritableFile* file) { + if (file != nullptr && file->plugin_file != nullptr) { + tf_writable_file::Cleanup(file); + } + }); + writer->plugin_file = new tf_writable_file::S3File( + bucket, object, s3_file->s3_client, + s3_file->transfer_managers[Aws::Transfer::TransferDirection::UPLOAD]); + TF_SetStatus(status, TF_OK, ""); + + // Wraping inside a `std::unique_ptr` to prevent memory-leaking. 
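+  // The reader created here is used to copy any existing object contents into
+  // the writable file's local temporary file, since S3 has no native append
+  // operation; later Appends then continue from the existing contents.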
+ std::unique_ptr reader( + new TF_RandomAccessFile, [](TF_RandomAccessFile* file) { + if (file != nullptr) { + if (file->plugin_file != nullptr) + tf_random_access_file::Cleanup(file); + delete file; + } + }); + // We set `reader->plugin_file` to `nullptr` in order to avoid segment fault + // when calling deleter of `unique_ptr` + reader->plugin_file = nullptr; + NewRandomAccessFile(filesystem, path, reader.get(), status); + if (TF_GetCode(status) != TF_OK) return; + + uint64_t offset = 0; + std::string buffer(kS3ReadAppendableFileBufferSize, {}); + while (true) { + auto read = tf_random_access_file::Read(reader.get(), offset, + kS3ReadAppendableFileBufferSize, + &buffer[0], status); + if (TF_GetCode(status) == TF_NOT_FOUND) { + break; + } else if (TF_GetCode(status) == TF_OK) { + offset += read; + tf_writable_file::Append(file, buffer.c_str(), read, status); + if (TF_GetCode(status) != TF_OK) return; + } else if (TF_GetCode(status) == TF_OUT_OF_RANGE) { + offset += read; + tf_writable_file::Append(file, buffer.c_str(), read, status); + if (TF_GetCode(status) != TF_OK) return; + break; + } else { + return; + } + } + writer.release(); + TF_SetStatus(status, TF_OK, ""); +} + +void Stat(const TF_Filesystem* filesystem, const char* path, + TF_FileStatistics* stats, TF_Status* status) { + Aws::String bucket, object; + ParseS3Path(path, true, &bucket, &object, status); + if (TF_GetCode(status) != TF_OK) return; + auto s3_file = static_cast(filesystem->plugin_filesystem); + GetS3Client(s3_file); + + if (object.empty()) { + Aws::S3::Model::HeadBucketRequest head_bucket_request; + head_bucket_request.WithBucket(bucket); + auto head_bucket_outcome = + s3_file->s3_client->HeadBucket(head_bucket_request); + if (!head_bucket_outcome.IsSuccess()) + return TF_SetStatusFromAWSError(head_bucket_outcome.GetError(), status); + stats->length = 0; + stats->is_directory = 1; + stats->mtime_nsec = 0; + return TF_SetStatus(status, TF_OK, ""); + } + + bool found = false; + Aws::S3::Model::HeadObjectRequest head_object_request; + head_object_request.WithBucket(bucket).WithKey(object); + head_object_request.SetResponseStreamFactory( + []() { return Aws::New(kS3FileSystemAllocationTag); }); + auto head_object_outcome = + s3_file->s3_client->HeadObject(head_object_request); + if (head_object_outcome.IsSuccess()) { + stats->length = head_object_outcome.GetResult().GetContentLength(); + stats->is_directory = 0; + stats->mtime_nsec = + head_object_outcome.GetResult().GetLastModified().Millis() * 1e6; + found = true; + } else { + TF_SetStatusFromAWSError(head_object_outcome.GetError(), status); + if (TF_GetCode(status) == TF_FAILED_PRECONDITION) return; + } + + auto prefix = object; + if (prefix.back() != '/') { + prefix.push_back('/'); + } + Aws::S3::Model::ListObjectsRequest list_objects_request; + list_objects_request.WithBucket(bucket).WithPrefix(prefix).WithMaxKeys(1); + list_objects_request.SetResponseStreamFactory( + []() { return Aws::New(kS3FileSystemAllocationTag); }); + auto list_objects_outcome = + s3_file->s3_client->ListObjects(list_objects_request); + if (list_objects_outcome.IsSuccess()) { + auto objects = list_objects_outcome.GetResult().GetContents(); + if (objects.size() > 0) { + stats->length = 0; + stats->is_directory = 1; + stats->mtime_nsec = objects[0].GetLastModified().Millis() * 1e6; + found = true; + } + } else { + TF_SetStatusFromAWSError(list_objects_outcome.GetError(), status); + if (TF_GetCode(status) == TF_FAILED_PRECONDITION) return; + } + if (!found) + return TF_SetStatus( + status, 
TF_NOT_FOUND, + absl::StrCat("Object ", path, " does not exist").c_str()); + TF_SetStatus(status, TF_OK, ""); +} + +void PathExists(const TF_Filesystem* filesystem, const char* path, + TF_Status* status) { + TF_FileStatistics stats; + Stat(filesystem, path, &stats, status); +} + +int64_t GetFileSize(const TF_Filesystem* filesystem, const char* path, + TF_Status* status) { + TF_FileStatistics stats; + Stat(filesystem, path, &stats, status); + return stats.length; +} + +void NewReadOnlyMemoryRegionFromFile(const TF_Filesystem* filesystem, + const char* path, + TF_ReadOnlyMemoryRegion* region, + TF_Status* status) { + Aws::String bucket, object; + ParseS3Path(path, false, &bucket, &object, status); + if (TF_GetCode(status) != TF_OK) return; + + auto s3_file = static_cast(filesystem->plugin_filesystem); + GetS3Client(s3_file); + GetTransferManager(Aws::Transfer::TransferDirection::UPLOAD, s3_file); + + auto size = GetFileSize(filesystem, path, status); + if (TF_GetCode(status) != TF_OK) return; + if (size == 0) + return TF_SetStatus(status, TF_INVALID_ARGUMENT, "File is empty"); + + std::unique_ptr data(new char[size]); + // Wraping inside a `std::unique_ptr` to prevent memory-leaking. + std::unique_ptr reader( + new TF_RandomAccessFile, [](TF_RandomAccessFile* file) { + if (file != nullptr) { + if (file->plugin_file != nullptr) + tf_random_access_file::Cleanup(file); + delete file; + } + }); + // We set `reader->plugin_file` to `nullptr` in order to avoid segment fault + // when calling deleter of `unique_ptr` + reader->plugin_file = nullptr; + NewRandomAccessFile(filesystem, path, reader.get(), status); + if (TF_GetCode(status) != TF_OK) return; + auto read = + tf_random_access_file::Read(reader.get(), 0, size, data.get(), status); + if (TF_GetCode(status) != TF_OK) return; + + region->plugin_memory_region = new tf_read_only_memory_region::S3MemoryRegion( + {std::move(data), static_cast(read)}); + TF_SetStatus(status, TF_OK, ""); +} + +static void SimpleCopyFile(const Aws::String& source, + const Aws::String& bucket_dst, + const Aws::String& object_dst, S3File* s3_file, + TF_Status* status) { + Aws::S3::Model::CopyObjectRequest copy_object_request; + copy_object_request.WithCopySource(source) + .WithBucket(bucket_dst) + .WithKey(object_dst); + auto copy_object_outcome = + s3_file->s3_client->CopyObject(copy_object_request); + if (!copy_object_outcome.IsSuccess()) + TF_SetStatusFromAWSError(copy_object_outcome.GetError(), status); + else + TF_SetStatus(status, TF_OK, ""); +}; + +using EtagOutcome = + Aws::Utils::Outcome>; +typedef struct MultipartCopyAsyncContext + : public Aws::Client::AsyncCallerContext { + int part_number; + int* num_finished_parts; + Aws::Vector* etag_outcomes; + + // lock and cv for multi part copy + absl::Mutex* multi_part_copy_mutex; + absl::CondVar* multi_part_copy_cv; +} MultipartCopyAsyncContext; + +static void AbortMultiPartCopy(const Aws::String& bucket_dst, + const Aws::String& object_dst, + const Aws::String& upload_id, S3File* s3_file, + TF_Status* status) { + Aws::S3::Model::AbortMultipartUploadRequest request; + request.WithBucket(bucket_dst).WithKey(object_dst).WithUploadId(upload_id); + auto outcome = s3_file->s3_client->AbortMultipartUpload(request); + if (!outcome.IsSuccess()) + TF_SetStatusFromAWSError(outcome.GetError(), status); + else + TF_SetStatus(status, TF_OK, ""); +} + +static void MultiPartCopyCallback( + const Aws::S3::Model::UploadPartCopyRequest& request, + const Aws::S3::Model::UploadPartCopyOutcome& outcome, + const std::shared_ptr& 
context) { + // Access to `etag_outcomes` should be thread-safe because of distinct + // `part_number`. + auto part_number = context->part_number; + auto etag_outcomes = context->etag_outcomes; + if (outcome.IsSuccess()) { + (*etag_outcomes)[part_number] = + outcome.GetResult().GetCopyPartResult().GetETag(); + } else { + (*etag_outcomes)[part_number] = outcome.GetError(); + } + { + absl::MutexLock l(context->multi_part_copy_mutex); + (*context->num_finished_parts)++; + context->multi_part_copy_cv->Signal(); + } +} + +static void MultiPartCopy(const Aws::String& source, + const Aws::String& bucket_dst, + const Aws::String& object_dst, const size_t num_parts, + const uint64_t file_size, S3File* s3_file, + TF_Status* status) { + Aws::S3::Model::CreateMultipartUploadRequest create_multipart_upload_request; + create_multipart_upload_request.WithBucket(bucket_dst).WithKey(object_dst); + + GetS3Client(s3_file); + GetTransferManager(Aws::Transfer::TransferDirection::UPLOAD, s3_file); + + auto create_multipart_upload_outcome = + s3_file->s3_client->CreateMultipartUpload( + create_multipart_upload_request); + if (!create_multipart_upload_outcome.IsSuccess()) + return TF_SetStatusFromAWSError(create_multipart_upload_outcome.GetError(), + status); + + auto upload_id = create_multipart_upload_outcome.GetResult().GetUploadId(); + + int num_finished_parts = 0; + // Keep track of `Outcome` of each upload part. + Aws::Vector etag_outcomes(num_parts); + // Mutex which protects access of the part_states map. + absl::Mutex multi_part_copy_mutex; + // Condition variable to be used with above mutex for synchronization. + absl::CondVar multi_part_copy_cv; + + auto chunk_size = + s3_file->multi_part_chunk_sizes[Aws::Transfer::TransferDirection::UPLOAD]; + + size_t retries = 0; + while (retries++ < 3) { + // Queue up parts. + for (auto part_number = 0; part_number < num_parts; ++part_number) { + if (etag_outcomes[part_number].IsSuccess()) continue; + uint64_t start_pos = part_number * chunk_size; + uint64_t end_pos = start_pos + chunk_size - 1; + if (end_pos >= file_size) end_pos = file_size - 1; + + Aws::String range = + absl::StrCat("bytes=", start_pos, "-", end_pos).c_str(); + Aws::S3::Model::UploadPartCopyRequest upload_part_copy_request; + upload_part_copy_request.WithBucket(bucket_dst) + .WithKey(object_dst) + .WithCopySource(source) + .WithCopySourceRange(range) + // S3 API partNumber starts from 1. + .WithPartNumber(part_number + 1) + .WithUploadId(upload_id); + + auto multi_part_context = + Aws::MakeShared("MultiPartCopyContext"); + multi_part_context->part_number = part_number; + multi_part_context->num_finished_parts = &num_finished_parts; + multi_part_context->etag_outcomes = &etag_outcomes; + multi_part_context->multi_part_copy_mutex = &multi_part_copy_mutex; + multi_part_context->multi_part_copy_cv = &multi_part_copy_cv; + auto callback = + [](const Aws::S3::S3Client* client, + const Aws::S3::Model::UploadPartCopyRequest& request, + const Aws::S3::Model::UploadPartCopyOutcome& outcome, + const std::shared_ptr& + context) { + auto multipart_context = + std::static_pointer_cast( + context); + MultiPartCopyCallback(request, outcome, multipart_context); + }; + + std::shared_ptr context = + multi_part_context; + s3_file->s3_client->UploadPartCopyAsync(upload_part_copy_request, + callback, context); + } + // Wait till they finish. + { + absl::MutexLock l(&multi_part_copy_mutex); + // Wait on the mutex until notify is called then check the finished parts + // as there could be false notifications. 
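+      // (num_finished_parts is only modified while holding the same mutex,
+      // so re-checking the predicate here is race-free.)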
+ while (num_finished_parts != num_parts) { + multi_part_copy_cv.Wait(&multi_part_copy_mutex); + } + } + // check if there was any error for any part. + for (auto part_number = 0; part_number < num_parts; ++part_number) { + if (!etag_outcomes[part_number].IsSuccess()) { + if (retries >= 3) { + AbortMultiPartCopy(bucket_dst, object_dst, upload_id, s3_file, + status); + if (TF_GetCode(status) != TF_OK) return; + return TF_SetStatusFromAWSError(etag_outcomes[part_number].GetError(), + status); + } else { + // Retry. + num_finished_parts--; + } + } + } + } + + Aws::S3::Model::CompletedMultipartUpload completed_multipart_upload; + // If there was an error still in any part, it would abort and return in the + // above loop. We set the eTag of completed parts to the final + // `completed_multipart_upload`. Note these parts have to be added in order. + for (int part_number = 0; part_number < num_parts; ++part_number) { + Aws::S3::Model::CompletedPart completed_part; + completed_part.SetPartNumber(part_number + 1); + completed_part.SetETag(etag_outcomes[part_number].GetResult()); + completed_multipart_upload.AddParts(completed_part); + } + + Aws::S3::Model::CompleteMultipartUploadRequest + complete_multipart_upload_request; + complete_multipart_upload_request.WithBucket(bucket_dst) + .WithKey(object_dst) + .WithUploadId(upload_id) + .WithMultipartUpload(completed_multipart_upload); + auto complete_multipart_upload_outcome = + s3_file->s3_client->CompleteMultipartUpload( + complete_multipart_upload_request); + if (!complete_multipart_upload_outcome.IsSuccess()) + AbortMultiPartCopy(bucket_dst, object_dst, upload_id, s3_file, status); + else + return TF_SetStatus(status, TF_OK, ""); + if (TF_GetCode(status) == TF_OK) + return TF_SetStatusFromAWSError( + complete_multipart_upload_outcome.GetError(), status); +}; + +void CopyFile(const TF_Filesystem* filesystem, const char* src, const char* dst, + TF_Status* status) { + auto file_size = GetFileSize(filesystem, src, status); + if (TF_GetCode(status) != TF_OK) return; + if (file_size == 0) + return TF_SetStatus(status, TF_FAILED_PRECONDITION, + "Source is a directory or empty file"); + + Aws::String bucket_src, object_src; + ParseS3Path(src, false, &bucket_src, &object_src, status); + if (TF_GetCode(status) != TF_OK) return; + Aws::String copy_src = bucket_src + "/" + object_src; + + Aws::String bucket_dst, object_dst; + ParseS3Path(dst, false, &bucket_dst, &object_dst, status); + if (TF_GetCode(status) != TF_OK) return; + + auto s3_file = static_cast(filesystem->plugin_filesystem); + auto chunk_size = + s3_file->multi_part_chunk_sizes[Aws::Transfer::TransferDirection::UPLOAD]; + size_t num_parts = 1; + if (file_size > chunk_size) num_parts = ceil((float)file_size / chunk_size); + if (num_parts == 1) + SimpleCopyFile(copy_src, bucket_dst, object_dst, s3_file, status); + else if (num_parts > 10000) + TF_SetStatus( + status, TF_UNIMPLEMENTED, + absl::StrCat("MultiPartCopy with number of parts more than 10000 is " + "not supported. Your object ", + src, " required ", num_parts, + " as multi_part_copy_part_size is set to ", chunk_size, + ". 
You can control this part size using the environment " + "variable S3_MULTI_PART_COPY_PART_SIZE to increase it.") + .c_str()); + else + MultiPartCopy(copy_src, bucket_dst, object_dst, num_parts, file_size, + s3_file, status); +} + +void DeleteFile(const TF_Filesystem* filesystem, const char* path, + TF_Status* status) { + Aws::String bucket, object; + ParseS3Path(path, false, &bucket, &object, status); + if (TF_GetCode(status) != TF_OK) return; + auto s3_file = static_cast(filesystem->plugin_filesystem); + GetS3Client(s3_file); + + Aws::S3::Model::DeleteObjectRequest delete_object_request; + delete_object_request.WithBucket(bucket).WithKey(object); + auto delete_object_outcome = + s3_file->s3_client->DeleteObject(delete_object_request); + if (!delete_object_outcome.IsSuccess()) + TF_SetStatusFromAWSError(delete_object_outcome.GetError(), status); + else + TF_SetStatus(status, TF_OK, ""); +} + +void CreateDir(const TF_Filesystem* filesystem, const char* path, + TF_Status* status) { + Aws::String bucket, object; + ParseS3Path(path, true, &bucket, &object, status); + if (TF_GetCode(status) != TF_OK) return; + auto s3_file = static_cast(filesystem->plugin_filesystem); + GetS3Client(s3_file); + + if (object.empty()) { + Aws::S3::Model::HeadBucketRequest head_bucket_request; + head_bucket_request.WithBucket(bucket); + auto head_bucket_outcome = + s3_file->s3_client->HeadBucket(head_bucket_request); + if (!head_bucket_outcome.IsSuccess()) + TF_SetStatusFromAWSError(head_bucket_outcome.GetError(), status); + else + TF_SetStatus(status, TF_OK, ""); + return; + } + + Aws::String dir_path = path; + if (dir_path.back() != '/') dir_path.push_back('/'); + + PathExists(filesystem, dir_path.c_str(), status); + if (TF_GetCode(status) == TF_OK) { + std::unique_ptr file( + new TF_WritableFile, [](TF_WritableFile* file) { + if (file != nullptr) { + if (file->plugin_file != nullptr) tf_writable_file::Cleanup(file); + delete file; + } + }); + file->plugin_file = nullptr; + NewWritableFile(filesystem, dir_path.c_str(), file.get(), status); + if (TF_GetCode(status) != TF_OK) return; + tf_writable_file::Close(file.get(), status); + if (TF_GetCode(status) != TF_OK) return; + } + TF_SetStatus(status, TF_OK, ""); +} + +void DeleteDir(const TF_Filesystem* filesystem, const char* path, + TF_Status* status) { + Aws::String bucket, object; + ParseS3Path(path, false, &bucket, &object, status); + if (TF_GetCode(status) != TF_OK) return; + auto s3_file = static_cast(filesystem->plugin_filesystem); + GetS3Client(s3_file); + + if (object.back() != '/') object.push_back('/'); + Aws::S3::Model::ListObjectsRequest list_objects_request; + list_objects_request.WithBucket(bucket).WithPrefix(object).WithMaxKeys(2); + list_objects_request.SetResponseStreamFactory( + []() { return Aws::New(kS3FileSystemAllocationTag); }); + auto list_objects_outcome = + s3_file->s3_client->ListObjects(list_objects_request); + if (list_objects_outcome.IsSuccess()) { + auto contents = list_objects_outcome.GetResult().GetContents(); + if (contents.size() > 1 || + (contents.size() == 1 && contents[0].GetKey() != object)) { + TF_SetStatus(status, TF_UNKNOWN, + "Cannot delete a non-empty directory. 
" + "This operation will be retried in case this " + "is due to S3's eventual consistency."); + } + if (contents.size() == 1 && contents[0].GetKey() == object) { + Aws::String dir_path = path; + if (dir_path.back() != '/') dir_path.push_back('/'); + DeleteFile(filesystem, dir_path.c_str(), status); + } + } else { + TF_SetStatusFromAWSError(list_objects_outcome.GetError(), status); + } +} + +void RenameFile(const TF_Filesystem* filesystem, const char* src, + const char* dst, TF_Status* status) { + Aws::String bucket_src, object_src; + ParseS3Path(src, false, &bucket_src, &object_src, status); + if (TF_GetCode(status) != TF_OK) return; + Aws::String copy_src = bucket_src + "/" + object_src; + + Aws::String bucket_dst, object_dst; + ParseS3Path(dst, false, &bucket_dst, &object_dst, status); + if (TF_GetCode(status) != TF_OK) return; + + auto s3_file = static_cast(filesystem->plugin_filesystem); + GetS3Client(s3_file); + + if (object_src.back() == '/') { + if (object_dst.back() != '/') { + object_dst.push_back('/'); + } + } else { + if (object_dst.back() == '/') { + object_dst.pop_back(); + } + } + + Aws::S3::Model::DeleteObjectRequest delete_object_request; + Aws::S3::Model::ListObjectsRequest list_objects_request; + list_objects_request.WithBucket(bucket_src) + .WithPrefix(object_src) + .WithMaxKeys(kS3GetChildrenMaxKeys); + list_objects_request.SetResponseStreamFactory( + []() { return Aws::New(kS3FileSystemAllocationTag); }); + + Aws::S3::Model::ListObjectsResult list_objects_result; + do { + auto list_objects_outcome = + s3_file->s3_client->ListObjects(list_objects_request); + if (!list_objects_outcome.IsSuccess()) + return TF_SetStatusFromAWSError(list_objects_outcome.GetError(), status); + + list_objects_result = list_objects_outcome.GetResult(); + for (const auto& object : list_objects_result.GetContents()) { + Aws::String key_src = object.GetKey(); + Aws::String key_dst = key_src; + key_dst.replace(0, object_src.length(), object_dst); + CopyFile(filesystem, ("s3://" + bucket_src + "/" + key_src).c_str(), + ("s3://" + bucket_dst + "/" + key_dst).c_str(), status); + if (TF_GetCode(status) != TF_OK) return; + + delete_object_request.WithBucket(bucket_src).WithKey(key_src); + auto delete_object_outcome = + s3_file->s3_client->DeleteObject(delete_object_request); + if (!delete_object_outcome.IsSuccess()) + return TF_SetStatusFromAWSError(delete_object_outcome.GetError(), + status); + } + list_objects_request.SetMarker(list_objects_result.GetNextMarker()); + } while (list_objects_result.GetIsTruncated()); + TF_SetStatus(status, TF_OK, ""); +} + +int GetChildren(const TF_Filesystem* filesystem, const char* path, + char*** entries, TF_Status* status) { + Aws::String bucket, prefix; + ParseS3Path(path, true, &bucket, &prefix, status); + if (TF_GetCode(status) != TF_OK) return -1; + if (!prefix.empty() && prefix.back() != '/') prefix.push_back('/'); + + auto s3_file = static_cast(filesystem->plugin_filesystem); + GetS3Client(s3_file); + + Aws::S3::Model::ListObjectsRequest list_objects_request; + list_objects_request.WithBucket(bucket) + .WithPrefix(prefix) + .WithMaxKeys(kS3GetChildrenMaxKeys) + .WithDelimiter("/"); + list_objects_request.SetResponseStreamFactory( + []() { return Aws::New(kS3FileSystemAllocationTag); }); + + Aws::S3::Model::ListObjectsResult list_objects_result; + std::vector result; + do { + auto list_objects_outcome = + s3_file->s3_client->ListObjects(list_objects_request); + if (!list_objects_outcome.IsSuccess()) { + TF_SetStatusFromAWSError(list_objects_outcome.GetError(), 
status); + return -1; + } + + list_objects_result = list_objects_outcome.GetResult(); + for (const auto& object : list_objects_result.GetCommonPrefixes()) { + Aws::String s = object.GetPrefix(); + s.erase(s.length() - 1); + Aws::String entry = s.substr(prefix.length()); + if (entry.length() > 0) { + result.push_back(entry); + } + } + for (const auto& object : list_objects_result.GetContents()) { + Aws::String s = object.GetKey(); + Aws::String entry = s.substr(prefix.length()); + if (entry.length() > 0) { + result.push_back(entry); + } + } + list_objects_result.SetMarker(list_objects_result.GetNextMarker()); + } while (list_objects_result.GetIsTruncated()); + + int num_entries = result.size(); + *entries = static_cast( + plugin_memory_allocate(num_entries * sizeof((*entries)[0]))); + for (int i = 0; i < num_entries; i++) + (*entries)[i] = strdup(result[i].c_str()); + TF_SetStatus(status, TF_OK, ""); + return num_entries; +} + +static char* TranslateName(const TF_Filesystem* filesystem, const char* uri) { + return strdup(uri); +} + +} // namespace tf_s3_filesystem + +static void ProvideFilesystemSupportFor(TF_FilesystemPluginOps* ops, + const char* uri) { + TF_SetFilesystemVersionMetadata(ops); + ops->scheme = strdup(uri); + + ops->random_access_file_ops = static_cast( + plugin_memory_allocate(TF_RANDOM_ACCESS_FILE_OPS_SIZE)); + ops->random_access_file_ops->cleanup = tf_random_access_file::Cleanup; + ops->random_access_file_ops->read = tf_random_access_file::Read; + + ops->writable_file_ops = static_cast( + plugin_memory_allocate(TF_WRITABLE_FILE_OPS_SIZE)); + ops->writable_file_ops->cleanup = tf_writable_file::Cleanup; + ops->writable_file_ops->append = tf_writable_file::Append; + ops->writable_file_ops->tell = tf_writable_file::Tell; + ops->writable_file_ops->flush = tf_writable_file::Flush; + ops->writable_file_ops->sync = tf_writable_file::Sync; + ops->writable_file_ops->close = tf_writable_file::Close; + + ops->read_only_memory_region_ops = static_cast( + plugin_memory_allocate(TF_READ_ONLY_MEMORY_REGION_OPS_SIZE)); + ops->read_only_memory_region_ops->cleanup = + tf_read_only_memory_region::Cleanup; + ops->read_only_memory_region_ops->data = tf_read_only_memory_region::Data; + ops->read_only_memory_region_ops->length = tf_read_only_memory_region::Length; + + ops->filesystem_ops = static_cast( + plugin_memory_allocate(TF_FILESYSTEM_OPS_SIZE)); + ops->filesystem_ops->init = tf_s3_filesystem::Init; + ops->filesystem_ops->cleanup = tf_s3_filesystem::Cleanup; + ops->filesystem_ops->new_random_access_file = + tf_s3_filesystem::NewRandomAccessFile; + ops->filesystem_ops->new_writable_file = tf_s3_filesystem::NewWritableFile; + ops->filesystem_ops->new_appendable_file = + tf_s3_filesystem::NewAppendableFile; + ops->filesystem_ops->new_read_only_memory_region_from_file = + tf_s3_filesystem::NewReadOnlyMemoryRegionFromFile; + ops->filesystem_ops->create_dir = tf_s3_filesystem::CreateDir; + ops->filesystem_ops->delete_file = tf_s3_filesystem::DeleteFile; + ops->filesystem_ops->delete_dir = tf_s3_filesystem::DeleteDir; + ops->filesystem_ops->copy_file = tf_s3_filesystem::CopyFile; + ops->filesystem_ops->rename_file = tf_s3_filesystem::RenameFile; + ops->filesystem_ops->path_exists = tf_s3_filesystem::PathExists; + ops->filesystem_ops->get_file_size = tf_s3_filesystem::GetFileSize; + ops->filesystem_ops->stat = tf_s3_filesystem::Stat; + ops->filesystem_ops->get_children = tf_s3_filesystem::GetChildren; + ops->filesystem_ops->translate_name = tf_s3_filesystem::TranslateName; +} + +void 
TF_InitPlugin(TF_FilesystemPluginInfo* info) { + info->plugin_memory_allocate = plugin_memory_allocate; + info->plugin_memory_free = plugin_memory_free; + info->num_schemes = 1; + info->ops = static_cast( + plugin_memory_allocate(info->num_schemes * sizeof(info->ops[0]))); + ProvideFilesystemSupportFor(&info->ops[0], "s3"); +} diff --git a/tensorflow/c/experimental/filesystem/plugins/s3/s3_filesystem.h b/tensorflow/c/experimental/filesystem/plugins/s3/s3_filesystem.h new file mode 100644 index 00000000000..4a995e8c109 --- /dev/null +++ b/tensorflow/c/experimental/filesystem/plugins/s3/s3_filesystem.h @@ -0,0 +1,101 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_S3_S3_FILESYSTEM_H_ +#define TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_S3_S3_FILESYSTEM_H_ + +#include +#include +#include +#include +#include +#include + +#include "absl/synchronization/mutex.h" +#include "tensorflow/c/experimental/filesystem/filesystem_interface.h" +#include "tensorflow/c/tf_status.h" + +void ParseS3Path(const Aws::String& fname, bool object_empty_ok, + Aws::String* bucket, Aws::String* object, TF_Status* status); + +namespace tf_random_access_file { +void Cleanup(TF_RandomAccessFile* file); +int64_t Read(const TF_RandomAccessFile* file, uint64_t offset, size_t n, + char* buffer, TF_Status* status); +} // namespace tf_random_access_file + +namespace tf_writable_file { +void Cleanup(TF_WritableFile* file); +void Append(const TF_WritableFile* file, const char* buffer, size_t n, + TF_Status* status); +int64_t Tell(const TF_WritableFile* file, TF_Status* status); +void Sync(const TF_WritableFile* file, TF_Status* status); +void Flush(const TF_WritableFile* file, TF_Status* status); +void Close(const TF_WritableFile* file, TF_Status* status); +} // namespace tf_writable_file + +namespace tf_read_only_memory_region { +void Cleanup(TF_ReadOnlyMemoryRegion* region); +const void* Data(const TF_ReadOnlyMemoryRegion* region); +uint64_t Length(const TF_ReadOnlyMemoryRegion* region); +} // namespace tf_read_only_memory_region + +namespace tf_s3_filesystem { +typedef struct S3File { + std::shared_ptr s3_client; + std::shared_ptr executor; + // We need 2 `TransferManager`, for multipart upload/download. + Aws::Map> + transfer_managers; + // Sizes to split objects during multipart upload/download. 
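+  // (Keyed by `Aws::Transfer::TransferDirection`; `CopyFile` reads the UPLOAD
+  // entry to choose between a simple and a multipart copy, and tests override
+  // the DOWNLOAD entry to exercise chunked reads.)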
+ Aws::Map multi_part_chunk_sizes; + bool use_multi_part_download; + absl::Mutex initialization_lock; + S3File(); +} S3File; + +void Init(TF_Filesystem* filesystem, TF_Status* status); +void Cleanup(TF_Filesystem* filesystem); +void NewRandomAccessFile(const TF_Filesystem* filesystem, const char* path, + TF_RandomAccessFile* file, TF_Status* status); +void NewWritableFile(const TF_Filesystem* filesystem, const char* path, + TF_WritableFile* file, TF_Status* status); +void NewAppendableFile(const TF_Filesystem* filesystem, const char* path, + TF_WritableFile* file, TF_Status* status); +int64_t GetFileSize(const TF_Filesystem* filesystem, const char* path, + TF_Status* status); +void NewReadOnlyMemoryRegionFromFile(const TF_Filesystem* filesystem, + const char* path, + TF_ReadOnlyMemoryRegion* region, + TF_Status* status); +void PathExists(const TF_Filesystem* filesystem, const char* path, + TF_Status* status); +void CreateDir(const TF_Filesystem* filesystem, const char* path, + TF_Status* status); +int GetChildren(const TF_Filesystem* filesystem, const char* path, + char*** entries, TF_Status* status); +void DeleteFile(const TF_Filesystem* filesystem, const char* path, + TF_Status* status); +void Stat(const TF_Filesystem* filesystem, const char* path, + TF_FileStatistics* stats, TF_Status* status); +void DeleteDir(const TF_Filesystem* filesystem, const char* path, + TF_Status* status); +void CopyFile(const TF_Filesystem* filesystem, const char* src, const char* dst, + TF_Status* status); +void RenameFile(const TF_Filesystem* filesystem, const char* src, + const char* dst, TF_Status* status); +} // namespace tf_s3_filesystem + +#endif // TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_S3_S3_FILESYSTEM_H_ diff --git a/tensorflow/c/experimental/filesystem/plugins/s3/s3_filesystem_test.cc b/tensorflow/c/experimental/filesystem/plugins/s3/s3_filesystem_test.cc new file mode 100644 index 00000000000..7dc80fb11ed --- /dev/null +++ b/tensorflow/c/experimental/filesystem/plugins/s3/s3_filesystem_test.cc @@ -0,0 +1,540 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/c/experimental/filesystem/plugins/s3/s3_filesystem.h" + +#include +#include + +#include "tensorflow/core/platform/path.h" +#include "tensorflow/core/platform/stacktrace_handler.h" +#include "tensorflow/core/platform/test.h" + +#define ASSERT_TF_OK(x) ASSERT_EQ(TF_OK, TF_GetCode(x)) << TF_Message(x) +#define EXPECT_TF_OK(x) EXPECT_EQ(TF_OK, TF_GetCode(x)) << TF_Message(x) + +static std::string InitializeTmpDir() { + // This env should be something like `s3://bucket/path` + const char* test_dir = getenv("S3_TEST_TMPDIR"); + if (test_dir != nullptr) { + Aws::String bucket, object; + TF_Status* status = TF_NewStatus(); + ParseS3Path(test_dir, true, &bucket, &object, status); + if (TF_GetCode(status) != TF_OK) { + TF_DeleteStatus(status); + return ""; + } + TF_DeleteStatus(status); + + // We add a random value into `test_dir` to ensures that two consecutive + // runs are unlikely to clash. + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution<> distribution; + std::string rng_val = std::to_string(distribution(gen)); + return tensorflow::io::JoinPath(std::string(test_dir), rng_val); + } else { + return ""; + } +} + +static std::string GetLocalLargeFile() { + // This env is used when we want to test against a large file ( ~ 50MB ). + // `S3_TEST_LOCAL_LARGE_FILE` and `S3_TEST_SERVER_LARGE_FILE` must be the same + // file. + static std::string path; + if (path.empty()) { + const char* env = getenv("S3_TEST_LOCAL_LARGE_FILE"); + if (env == nullptr) return ""; + path = env; + } + return path; +} + +static std::string GetServerLargeFile() { + // This env is used when we want to test against a large file ( ~ 50MB ). + // `S3_TEST_LOCAL_LARGE_FILE` and `S3_TEST_SERVER_LARGE_FILE` must be the same + // file. + static std::string path; + if (path.empty()) { + const char* env = getenv("S3_TEST_SERVER_LARGE_FILE"); + if (env == nullptr) return ""; + Aws::String bucket, object; + TF_Status* status = TF_NewStatus(); + ParseS3Path(env, false, &bucket, &object, status); + if (TF_GetCode(status) != TF_OK) { + TF_DeleteStatus(status); + return ""; + } + TF_DeleteStatus(status); + path = env; + } + return path; +} + +static std::string* GetTmpDir() { + static std::string tmp_dir = InitializeTmpDir(); + if (tmp_dir == "") + return nullptr; + else + return &tmp_dir; +} + +namespace tensorflow { +namespace { + +class S3FilesystemTest : public ::testing::Test { + public: + void SetUp() override { + root_dir_ = io::JoinPath( + *GetTmpDir(), + ::testing::UnitTest::GetInstance()->current_test_info()->name()); + status_ = TF_NewStatus(); + filesystem_ = new TF_Filesystem; + tf_s3_filesystem::Init(filesystem_, status_); + ASSERT_TF_OK(status_) << "Could not initialize filesystem. 
" + << TF_Message(status_); + } + void TearDown() override { + TF_DeleteStatus(status_); + tf_s3_filesystem::Cleanup(filesystem_); + delete filesystem_; + } + + std::string GetURIForPath(const std::string& path) { + const std::string translated_name = + tensorflow::io::JoinPath(root_dir_, path); + return translated_name; + } + + std::unique_ptr + GetWriter() { + std::unique_ptr writer( + new TF_WritableFile, [](TF_WritableFile* file) { + if (file != nullptr) { + if (file->plugin_file != nullptr) tf_writable_file::Cleanup(file); + delete file; + } + }); + writer->plugin_file = nullptr; + return writer; + } + + std::unique_ptr + GetReader() { + std::unique_ptr + reader(new TF_RandomAccessFile, [](TF_RandomAccessFile* file) { + if (file != nullptr) { + if (file->plugin_file != nullptr) + tf_random_access_file::Cleanup(file); + delete file; + } + }); + reader->plugin_file = nullptr; + return reader; + } + + void WriteString(const std::string& path, const std::string& content) { + auto writer = GetWriter(); + tf_s3_filesystem::NewWritableFile(filesystem_, path.c_str(), writer.get(), + status_); + if (TF_GetCode(status_) != TF_OK) return; + tf_writable_file::Append(writer.get(), content.c_str(), content.length(), + status_); + if (TF_GetCode(status_) != TF_OK) return; + tf_writable_file::Close(writer.get(), status_); + if (TF_GetCode(status_) != TF_OK) return; + } + + std::string ReadAll(const string& path) { + auto reader = GetReader(); + tf_s3_filesystem::NewRandomAccessFile(filesystem_, path.c_str(), + reader.get(), status_); + if (TF_GetCode(status_) != TF_OK) return ""; + + auto file_size = + tf_s3_filesystem::GetFileSize(filesystem_, path.c_str(), status_); + if (TF_GetCode(status_) != TF_OK) return ""; + + std::string content; + content.resize(file_size); + auto read = tf_random_access_file::Read(reader.get(), 0, file_size, + &content[0], status_); + if (TF_GetCode(status_) != TF_OK) return ""; + if (read >= 0) content.resize(read); + if (file_size != content.size()) + TF_SetStatus( + status_, TF_DATA_LOSS, + std::string("expected " + std::to_string(file_size) + " got " + + std::to_string(content.size()) + " bytes") + .c_str()); + return content; + } + + std::string ReadAllInChunks(const string& path, size_t buffer_size, + bool use_multi_part_download) { + auto reader = GetReader(); + auto s3_file = + static_cast(filesystem_->plugin_filesystem); + s3_file->use_multi_part_download = use_multi_part_download; + s3_file + ->multi_part_chunk_sizes[Aws::Transfer::TransferDirection::DOWNLOAD] = + buffer_size; + tf_s3_filesystem::NewRandomAccessFile(filesystem_, path.c_str(), + reader.get(), status_); + if (TF_GetCode(status_) != TF_OK) return ""; + auto file_size = + tf_s3_filesystem::GetFileSize(filesystem_, path.c_str(), status_); + if (TF_GetCode(status_) != TF_OK) return ""; + + std::size_t part_count = (std::max)( + static_cast((file_size + buffer_size - 1) / buffer_size), + static_cast(1)); + std::unique_ptr buffer{new char[buffer_size]}; + std::stringstream ss; + + uint64_t offset = 0; + uint64_t server_size = 0; + for (size_t i = 0; i < part_count; i++) { + offset = i * buffer_size; + buffer_size = + (i == part_count - 1) ? 
file_size - server_size : buffer_size; + auto read = tf_random_access_file::Read(reader.get(), offset, buffer_size, + buffer.get(), status_); + if (TF_GetCode(status_) != TF_OK) return ""; + if (read > 0) { + ss.write(buffer.get(), read); + server_size += static_cast(read); + } + if (server_size == file_size) break; + if (read != buffer_size) { + if (read == 0) + TF_SetStatus(status_, TF_OUT_OF_RANGE, "eof"); + else + TF_SetStatus( + status_, TF_DATA_LOSS, + ("truncated record at " + std::to_string(offset)).c_str()); + return ""; + } + } + + if (file_size != server_size) { + TF_SetStatus(status_, TF_DATA_LOSS, + std::string("expected " + std::to_string(file_size) + + " got " + std::to_string(server_size) + " bytes") + .c_str()); + return ""; + } + TF_SetStatus(status_, TF_OK, ""); + return ss.str(); + } + + protected: + TF_Filesystem* filesystem_; + TF_Status* status_; + + private: + std::string root_dir_; +}; + +TEST_F(S3FilesystemTest, NewRandomAccessFile) { + const std::string path = GetURIForPath("RandomAccessFile"); + const std::string content = "abcdefghijklmn"; + + WriteString(path, content); + ASSERT_TF_OK(status_); + + auto reader = GetReader(); + tf_s3_filesystem::NewRandomAccessFile(filesystem_, path.c_str(), reader.get(), + status_); + EXPECT_TF_OK(status_); + + std::string result; + result.resize(content.size()); + auto read = tf_random_access_file::Read(reader.get(), 0, content.size(), + &result[0], status_); + result.resize(read); + EXPECT_TF_OK(status_); + EXPECT_EQ(content.size(), result.size()); + EXPECT_EQ(content, result); + + result.clear(); + result.resize(4); + read = tf_random_access_file::Read(reader.get(), 2, 4, &result[0], status_); + result.resize(read); + EXPECT_TF_OK(status_); + EXPECT_EQ(4, result.size()); + EXPECT_EQ(content.substr(2, 4), result); +} + +TEST_F(S3FilesystemTest, NewWritableFile) { + auto writer = GetWriter(); + const std::string path = GetURIForPath("WritableFile"); + tf_s3_filesystem::NewWritableFile(filesystem_, path.c_str(), writer.get(), + status_); + EXPECT_TF_OK(status_); + tf_writable_file::Append(writer.get(), "content1,", strlen("content1,"), + status_); + EXPECT_TF_OK(status_); + tf_writable_file::Append(writer.get(), "content2", strlen("content2"), + status_); + EXPECT_TF_OK(status_); + tf_writable_file::Flush(writer.get(), status_); + EXPECT_TF_OK(status_); + tf_writable_file::Sync(writer.get(), status_); + EXPECT_TF_OK(status_); + tf_writable_file::Close(writer.get(), status_); + EXPECT_TF_OK(status_); + + auto content = ReadAll(path); + EXPECT_TF_OK(status_); + EXPECT_EQ("content1,content2", content); +} + +TEST_F(S3FilesystemTest, NewAppendableFile) { + const std::string path = GetURIForPath("AppendableFile"); + WriteString(path, "test"); + ASSERT_TF_OK(status_); + + auto writer = GetWriter(); + tf_s3_filesystem::NewAppendableFile(filesystem_, path.c_str(), writer.get(), + status_); + EXPECT_TF_OK(status_); + tf_writable_file::Append(writer.get(), "content", strlen("content"), status_); + EXPECT_TF_OK(status_); + tf_writable_file::Close(writer.get(), status_); + EXPECT_TF_OK(status_); +} + +TEST_F(S3FilesystemTest, NewReadOnlyMemoryRegionFromFile) { + const std::string path = GetURIForPath("MemoryFile"); + const std::string content = "content"; + WriteString(path, content); + ASSERT_TF_OK(status_); + + std::unique_ptr + region(new TF_ReadOnlyMemoryRegion, [](TF_ReadOnlyMemoryRegion* file) { + if (file != nullptr) { + if (file->plugin_memory_region != nullptr) + tf_read_only_memory_region::Cleanup(file); + delete file; + } + }); 
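+  // Start from a null `plugin_memory_region` so the custom deleter above is a
+  // no-op if `NewReadOnlyMemoryRegionFromFile` fails before populating it.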
+ region->plugin_memory_region = nullptr; + tf_s3_filesystem::NewReadOnlyMemoryRegionFromFile(filesystem_, path.c_str(), + region.get(), status_); + EXPECT_TF_OK(status_); + std::string result(reinterpret_cast( + tf_read_only_memory_region::Data(region.get())), + tf_read_only_memory_region::Length(region.get())); + EXPECT_EQ(content, result); +} + +TEST_F(S3FilesystemTest, PathExists) { + const std::string path = GetURIForPath("PathExists"); + tf_s3_filesystem::PathExists(filesystem_, path.c_str(), status_); + EXPECT_EQ(TF_NOT_FOUND, TF_GetCode(status_)) << TF_Message(status_); + TF_SetStatus(status_, TF_OK, ""); + WriteString(path, "test"); + ASSERT_TF_OK(status_); + tf_s3_filesystem::PathExists(filesystem_, path.c_str(), status_); + EXPECT_TF_OK(status_); +} + +TEST_F(S3FilesystemTest, GetChildren) { + const std::string base = GetURIForPath("GetChildren"); + tf_s3_filesystem::CreateDir(filesystem_, base.c_str(), status_); + EXPECT_TF_OK(status_); + + const std::string file = io::JoinPath(base, "TestFile.csv"); + WriteString(file, "test"); + EXPECT_TF_OK(status_); + + const std::string subdir = io::JoinPath(base, "SubDir"); + tf_s3_filesystem::CreateDir(filesystem_, subdir.c_str(), status_); + EXPECT_TF_OK(status_); + const std::string subfile = io::JoinPath(subdir, "TestSubFile.csv"); + WriteString(subfile, "test"); + EXPECT_TF_OK(status_); + + char** entries; + auto num_entries = tf_s3_filesystem::GetChildren(filesystem_, base.c_str(), + &entries, status_); + EXPECT_TF_OK(status_); + + std::vector childrens; + for (int i = 0; i < num_entries; ++i) { + childrens.push_back(entries[i]); + } + std::sort(childrens.begin(), childrens.end()); + EXPECT_EQ(std::vector({"SubDir", "TestFile.csv"}), childrens); +} + +TEST_F(S3FilesystemTest, DeleteFile) { + const std::string path = GetURIForPath("DeleteFile"); + WriteString(path, "test"); + ASSERT_TF_OK(status_); + tf_s3_filesystem::DeleteFile(filesystem_, path.c_str(), status_); + EXPECT_TF_OK(status_); +} + +TEST_F(S3FilesystemTest, CreateDir) { + // s3 object storage doesn't support empty directory, we create file in the + // directory + const std::string dir = GetURIForPath("CreateDir"); + tf_s3_filesystem::CreateDir(filesystem_, dir.c_str(), status_); + EXPECT_TF_OK(status_); + + const std::string file = io::JoinPath(dir, "CreateDirFile.csv"); + WriteString(file, "test"); + ASSERT_TF_OK(status_); + + TF_FileStatistics stat; + tf_s3_filesystem::Stat(filesystem_, dir.c_str(), &stat, status_); + EXPECT_TF_OK(status_); + EXPECT_TRUE(stat.is_directory); +} + +TEST_F(S3FilesystemTest, DeleteDir) { + // s3 object storage doesn't support empty directory, we create file in the + // directory + const std::string dir = GetURIForPath("DeleteDir"); + const std::string file = io::JoinPath(dir, "DeleteDirFile.csv"); + WriteString(file, "test"); + ASSERT_TF_OK(status_); + tf_s3_filesystem::DeleteDir(filesystem_, dir.c_str(), status_); + EXPECT_NE(TF_GetCode(status_), TF_OK); + + TF_SetStatus(status_, TF_OK, ""); + tf_s3_filesystem::DeleteFile(filesystem_, file.c_str(), status_); + EXPECT_TF_OK(status_); + tf_s3_filesystem::DeleteDir(filesystem_, dir.c_str(), status_); + EXPECT_TF_OK(status_); + TF_FileStatistics stat; + tf_s3_filesystem::Stat(filesystem_, dir.c_str(), &stat, status_); + EXPECT_EQ(TF_GetCode(status_), TF_NOT_FOUND) << TF_Message(status_); +} + +TEST_F(S3FilesystemTest, StatFile) { + const std::string path = GetURIForPath("StatFile"); + WriteString(path, "test"); + ASSERT_TF_OK(status_); + + TF_FileStatistics stat; + 
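+  // A plain object key (no trailing '/') should stat as a 4-byte regular file.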
tf_s3_filesystem::Stat(filesystem_, path.c_str(), &stat, status_); + EXPECT_TF_OK(status_); + EXPECT_EQ(4, stat.length); + EXPECT_FALSE(stat.is_directory); +} + +TEST_F(S3FilesystemTest, SimpleCopyFile) { + const std::string src = GetURIForPath("SimpleCopySrc"); + const std::string dst = GetURIForPath("SimpleCopyDst"); + WriteString(src, "test"); + ASSERT_TF_OK(status_); + + tf_s3_filesystem::CopyFile(filesystem_, src.c_str(), dst.c_str(), status_); + EXPECT_TF_OK(status_); + auto result = ReadAll(dst); + EXPECT_TF_OK(status_); + EXPECT_EQ(result, "test"); +} + +TEST_F(S3FilesystemTest, RenameFile) { + const std::string src = GetURIForPath("RenameFileSrc"); + const std::string dst = GetURIForPath("RenameFileDst"); + WriteString(src, "test"); + ASSERT_TF_OK(status_); + + tf_s3_filesystem::RenameFile(filesystem_, src.c_str(), dst.c_str(), status_); + EXPECT_TF_OK(status_); + auto result = ReadAll(dst); + EXPECT_TF_OK(status_); + EXPECT_EQ("test", result); +} + +TEST_F(S3FilesystemTest, RenameFileOverwrite) { + const std::string src = GetURIForPath("RenameFileOverwriteSrc"); + const std::string dst = GetURIForPath("RenameFileOverwriteDst"); + + WriteString(src, "test_old"); + ASSERT_TF_OK(status_); + WriteString(dst, "test_new"); + ASSERT_TF_OK(status_); + + tf_s3_filesystem::PathExists(filesystem_, dst.c_str(), status_); + EXPECT_TF_OK(status_); + tf_s3_filesystem::RenameFile(filesystem_, src.c_str(), dst.c_str(), status_); + EXPECT_TF_OK(status_); + + auto result = ReadAll(dst); + EXPECT_TF_OK(status_); + EXPECT_EQ("test_old", result); +} + +// Test against large file. +TEST_F(S3FilesystemTest, ReadLargeFile) { + auto local_path = GetLocalLargeFile(); + auto server_path = GetServerLargeFile(); + if (local_path.empty() || server_path.empty()) GTEST_SKIP(); + std::ifstream in(local_path, std::ios::binary); + std::string local_content((std::istreambuf_iterator(in)), + std::istreambuf_iterator()); + + constexpr size_t buffer_size = 50 * 1024 * 1024; + auto server_content = ReadAllInChunks(server_path, buffer_size, true); + ASSERT_TF_OK(status_); + EXPECT_EQ(local_content, server_content); + + server_content = ReadAllInChunks(server_path, buffer_size, false); + ASSERT_TF_OK(status_); + EXPECT_EQ(local_content, server_content); +} + +TEST_F(S3FilesystemTest, CopyLargeFile) { + auto server_path = GetServerLargeFile(); + if (server_path.empty()) GTEST_SKIP(); + + auto path = GetURIForPath("CopyLargeFile"); + constexpr size_t buffer_size = 5 * 1024 * 1024; + auto s3_file = + static_cast(filesystem_->plugin_filesystem); + s3_file->multi_part_chunk_sizes[Aws::Transfer::TransferDirection::UPLOAD] = + buffer_size; + tf_s3_filesystem::CopyFile(filesystem_, server_path.c_str(), path.c_str(), + status_); + EXPECT_TF_OK(status_); + + auto server_size = + tf_s3_filesystem::GetFileSize(filesystem_, server_path.c_str(), status_); + EXPECT_TF_OK(status_); + auto actual_size = + tf_s3_filesystem::GetFileSize(filesystem_, path.c_str(), status_); + EXPECT_TF_OK(status_); + EXPECT_EQ(server_size, actual_size); +} + +} // namespace +} // namespace tensorflow + +GTEST_API_ int main(int argc, char** argv) { + tensorflow::testing::InstallStacktraceHandler(); + if (!GetTmpDir()) { + std::cerr << "Could not read S3_TEST_TMPDIR env"; + return -1; + } + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tensorflow/c/experimental/gradients/BUILD b/tensorflow/c/experimental/gradients/BUILD new file mode 100644 index 00000000000..80c4e8d9791 --- /dev/null +++ 
b/tensorflow/c/experimental/gradients/BUILD @@ -0,0 +1,24 @@ +# Library of gradient functions. +package( + licenses = ["notice"], # Apache 2.0 +) + +cc_library( + name = "math_grad", + srcs = ["math_grad.cc"], + hdrs = [ + "math_grad.h", + ], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + "//tensorflow/c/eager:abstract_operation", + "//tensorflow/c/eager:abstract_tensor_handle", + "//tensorflow/c/eager:c_api_unified_internal", + "//tensorflow/c/eager:gradients", + "//tensorflow/c/experimental/ops:array_ops", + "//tensorflow/c/experimental/ops:math_ops", + "//tensorflow/core/lib/llvm_rtti", + ], +) diff --git a/tensorflow/c/experimental/gradients/math_grad.cc b/tensorflow/c/experimental/gradients/math_grad.cc new file mode 100644 index 00000000000..d8b70848d4e --- /dev/null +++ b/tensorflow/c/experimental/gradients/math_grad.cc @@ -0,0 +1,86 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/c/experimental/gradients/math_grad.h" + +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/experimental/ops/array_ops.h" +#include "tensorflow/c/experimental/ops/math_ops.h" + +using std::vector; +using tensorflow::ops::Conj; +using tensorflow::ops::Identity; +using tensorflow::ops::Mul; + +namespace tensorflow { +namespace gradients { +namespace { + +class AddGradientFunction : public GradientFunction { + public: + Status Compute(Context* ctx, + absl::Span grad_inputs, + vector* grad_outputs) override { + grad_outputs->resize(2); + vector identity_outputs(1); + // TODO(b/145674566): Handle name unification in tracing code. + // TODO(b/161805092): Support broadcasting. 
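+    // Since d(x + y)/dx = d(x + y)/dy = 1, the gradient w.r.t. each input is
+    // just the incoming gradient, forwarded through an Identity op.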
+ TF_RETURN_IF_ERROR(ops::Identity(ctx->ctx, {grad_inputs[0]}, + absl::MakeSpan(identity_outputs), + "Identity0")); + (*grad_outputs)[0] = identity_outputs[0]; + TF_RETURN_IF_ERROR(ops::Identity(ctx->ctx, {grad_inputs[0]}, + absl::MakeSpan(identity_outputs), + "Identity1")); + (*grad_outputs)[1] = identity_outputs[0]; + return Status::OK(); + } + ~AddGradientFunction() override {} +}; + +class ExpGradientFunction : public GradientFunction { + public: + explicit ExpGradientFunction(AbstractTensorHandle* exp) : exp_(exp) { + exp->Ref(); + } + Status Compute(Context* ctx, + absl::Span grad_inputs, + vector* grad_outputs) override { + vector conj_outputs(1); + TF_RETURN_IF_ERROR( + Conj(ctx->ctx, {exp_.get()}, absl::MakeSpan(conj_outputs), "ExpConj")); + AbstractTensorHandlePtr conj_output_releaser(conj_outputs[0]); + grad_outputs->resize(1); + TF_RETURN_IF_ERROR(Mul(ctx->ctx, {conj_outputs[0], grad_inputs[0]}, + absl::MakeSpan(*grad_outputs), "ExpGradMul")); + return Status::OK(); + } + ~ExpGradientFunction() override {} + + private: + AbstractTensorHandlePtr exp_; +}; + +} // namespace + +GradientFunction* AddRegisterer(const ForwardOperation& op) { + return new AddGradientFunction; +} + +GradientFunction* ExpRegisterer(const ForwardOperation& op) { + return new ExpGradientFunction(op.outputs[0]); +} + +} // namespace gradients +} // namespace tensorflow diff --git a/tensorflow/c/experimental/gradients/math_grad.h b/tensorflow/c/experimental/gradients/math_grad.h new file mode 100644 index 00000000000..6c7242a1a49 --- /dev/null +++ b/tensorflow/c/experimental/gradients/math_grad.h @@ -0,0 +1,27 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_MATH_GRAD_H_ +#define TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_MATH_GRAD_H_ + +#include "tensorflow/c/eager/gradients.h" + +namespace tensorflow { +namespace gradients { +GradientFunction* AddRegisterer(const ForwardOperation& op); +GradientFunction* ExpRegisterer(const ForwardOperation& op); +} // namespace gradients +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_MATH_GRAD_H_ diff --git a/tensorflow/c/experimental/network.cc b/tensorflow/c/experimental/network.cc deleted file mode 100644 index 97e63ec6259..00000000000 --- a/tensorflow/c/experimental/network.cc +++ /dev/null @@ -1,166 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#include "tensorflow/c/experimental/network.h" - -#include -#include - -#include "tensorflow/c/c_api.h" -#include "tensorflow/c/c_api_internal.h" -#include "tensorflow/c/experimental/network_internal.h" -#include "tensorflow/c/experimental/rendezvous_internal.h" -#include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h" -#include "tensorflow/core/distributed_runtime/server_lib.h" -#include "tensorflow/core/distributed_runtime/worker_env.h" -#include "tensorflow/core/platform/errors.h" -#include "tensorflow/core/platform/status.h" - -using tensorflow::ServerFactory; - -namespace tensorflow { - -/* static */ Status CGrpcServer::Create( - const ServerDef& server_def, - void* (*init_function)(const TF_GrpcServer*, TF_Status*), - void (*start_function)(const TF_GrpcServer*, void*, TF_Status*), - void (*stop_function)(const TF_GrpcServer*, void*, TF_Status*), - void (*join_function)(const TF_GrpcServer*, void*, TF_Status*), - void (*delete_function)(void*), - TF_RemoteRendezvousBuilder* rendezvous_builder, - std::unique_ptr* out_server) { - auto* grpc_server = new CGrpcServer(server_def, start_function, stop_function, - join_function, delete_function); - - GrpcServerOptions options; - options.rendezvous_mgr_func = [rendezvous_builder](const WorkerEnv* env) { - return new CRendezvousMgr(env, rendezvous_builder); - }; - TF_RETURN_IF_ERROR(grpc_server->Init(options)); - TF_Status* tf_status = TF_NewStatus(); - grpc_server->SetContext(init_function( - reinterpret_cast(grpc_server), tf_status)); - TF_RETURN_IF_ERROR(tf_status->status); - TF_DeleteStatus(tf_status); - - out_server->reset(grpc_server); - return Status::OK(); -} - -Status CGrpcServer::Start() { - Status status = GrpcServer::Start(); - TF_Status* tf_status = TF_NewStatus(); - (*start_function_)(reinterpret_cast(this), context_, - tf_status); - status.Update(tf_status->status); - TF_DeleteStatus(tf_status); - return status; -} - -Status CGrpcServer::Stop() { - Status status = GrpcServer::Stop(); - TF_Status* tf_status = TF_NewStatus(); - (*stop_function_)(reinterpret_cast(this), context_, - tf_status); - status.Update(tf_status->status); - TF_DeleteStatus(tf_status); - return status; -} - -Status CGrpcServer::Join() { - Status status = GrpcServer::Join(); - TF_Status* tf_status = TF_NewStatus(); - (*join_function_)(reinterpret_cast(this), context_, - tf_status); - status.Update(tf_status->status); - TF_DeleteStatus(tf_status); - return status; -} - -namespace { -// Factory that creates CGrpcServer instances. 
-class CServerFactory : public ServerFactory { - public: - CServerFactory(bool (*accept_function)(const char*), - void* (*init_function)(const TF_GrpcServer*, TF_Status*), - void (*start_function)(const TF_GrpcServer*, void*, - TF_Status*), - void (*stop_function)(const TF_GrpcServer*, void*, TF_Status*), - void (*join_function)(const TF_GrpcServer*, void*, TF_Status*), - void (*delete_function)(void*), - TF_RemoteRendezvousBuilder* rendezvous_builder) - : accept_function_(accept_function), - init_function_(init_function), - start_function_(start_function), - stop_function_(stop_function), - join_function_(join_function), - delete_function_(delete_function), - rendezvous_builder_(rendezvous_builder) {} - - Status NewServer(const ServerDef& server_def, const Options& options, - std::unique_ptr* out_server) override { - TF_RETURN_IF_ERROR(CGrpcServer::Create( - server_def, init_function_, start_function_, stop_function_, - join_function_, delete_function_, rendezvous_builder_, out_server)); - return Status::OK(); - } - - // Returns true if and only if this factory can create a server - // based on the given `server_def`. - bool AcceptsOptions(const ServerDef& server_def) override { - return (*accept_function_)(server_def.protocol().c_str()); - } - - private: - bool (*accept_function_)(const char* protocol); - void* (*init_function_)(const TF_GrpcServer*, TF_Status*); - void (*start_function_)(const TF_GrpcServer*, void*, TF_Status*); - void (*stop_function_)(const TF_GrpcServer*, void*, TF_Status*); - void (*join_function_)(const TF_GrpcServer*, void*, TF_Status*); - void (*delete_function_)(void*); - TF_RemoteRendezvousBuilder* rendezvous_builder_; -}; -} // namespace -} // namespace tensorflow - -// Server factory representation to use in C API. -// Holds CServerFactory pointer. -struct TF_GrpcServerFactory { - ::tensorflow::CServerFactory* factory; -}; - -TF_GrpcServerFactory* TF_NewGrpcServerFactory( - bool (*accept_function)(const char*), - void* (*init_function)(const TF_GrpcServer*, TF_Status*), - void (*start_function)(const TF_GrpcServer*, void*, TF_Status*), - void (*stop_function)(const TF_GrpcServer*, void*, TF_Status*), - void (*join_function)(const TF_GrpcServer*, void*, TF_Status*), - void (*delete_function)(void*), - TF_RemoteRendezvousBuilder* rendezvous_builder) { - TF_GrpcServerFactory* server_factory = new TF_GrpcServerFactory; - server_factory->factory = new ::tensorflow::CServerFactory( - accept_function, init_function, start_function, stop_function, - join_function, delete_function, rendezvous_builder); - return server_factory; -} - -void TF_DeleteGrpcServerFactory(TF_GrpcServerFactory* server_factory) { - DCHECK_NE(server_factory, nullptr); - delete server_factory; -} - -void TF_RegisterGrpcServerFactory(const char* server_type, - TF_GrpcServerFactory* server_factory) { - ServerFactory::Register(server_type, server_factory->factory); -} diff --git a/tensorflow/c/experimental/network.h b/tensorflow/c/experimental/network.h deleted file mode 100644 index bd74ec8ffec..00000000000 --- a/tensorflow/c/experimental/network.h +++ /dev/null @@ -1,97 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#ifndef TENSORFLOW_C_EXPERIMENTAL_NETWORK_H_ -#define TENSORFLOW_C_EXPERIMENTAL_NETWORK_H_ - -#include "tensorflow/c/c_api.h" -#include "tensorflow/c/experimental/rendezvous.h" - -#ifdef __cplusplus -extern "C" { -#endif - -// -------------------------------------------------------------------------- -// C API for TensorFlow Networking. -// NOTE: This API is unstable and almost certainly will change in the near -// future. -// -// Users wishing to register a custom GrpcServer should call -// TF_NewServerFactory and then TF_RegisterGrpcServerFactory. -// -// Example: -// ```c++ -// auto* rendezvous_builder = TF_NewRemoteRendezvousBuilder( -// rendezvous_init_function, -// receive_from_remote_async_function, -// rendezvous_delete_function); -// -// TF_GrpcServerFactory* factory = TF_NewGrpcServerFactory( -// accept_function, -// init_function, -// start_function, -// stop_function, -// join_function, -// delete_function, -// rendezvous_builder); -// TF_RegisterGrpcServerFactory("customfactory", factory); -// ... -// TF_DeleteGrpcServerFactory(factory); -// ``` - -typedef struct TF_GrpcServerFactory TF_GrpcServerFactory; -typedef struct TF_GrpcServerOptions TF_GrpcServerOptions; -typedef struct TF_GrpcServer TF_GrpcServer; -typedef struct TF_ServerContext { - TF_GrpcServer* const server; - void* context; -} TF_ServerContext; - -// Creates a new TF_GrpcServerFactory instance. Caller takes ownership -// of TF_GrpcServerFactory instance and should deallocate it by calling -// TF_GrpcDeleteServerFactory. -// accept_function should return true if this ServerFactory can create -// server instances for the given protocol name (for e.g. grpc+verbs). -// GRPC servers created by this factory will call provided -// init_function, start_function, stop_function, join_function and -// delete_function. -// -// Note that clean shutdown is currently not implemented for GrpcServer. -// So, stop_function will never be called now but may be in the future -// when stop mechanism is supported. -TF_CAPI_EXPORT extern TF_GrpcServerFactory* TF_NewGrpcServerFactory( - bool (*accept_function)(const char*), - void* (*init_function)(const TF_GrpcServer*, TF_Status*), - void (*start_function)(const TF_GrpcServer*, void*, TF_Status*), - void (*stop_function)(const TF_GrpcServer*, void*, TF_Status*), - void (*join_function)(const TF_GrpcServer*, void*, TF_Status*), - void (*delete_function)(void*), - TF_RemoteRendezvousBuilder* rendezvous_builder); - -// Deletes TF_GrpcServerFactory instances. -// Note that this function only deletes TF_GrpcServerFactory wrapper. -// Actual underlying server factory would not be deleted and will -// remain registered. -TF_CAPI_EXPORT extern void TF_DeleteGrpcServerFactory( - TF_GrpcServerFactory* server_factory); - -// Registers provided server_factory for the given server_type. -// server_type must be unique to the server factory. 
-TF_CAPI_EXPORT extern void TF_RegisterGrpcServerFactory( - const char* server_type, TF_GrpcServerFactory* server_factory); - -#ifdef __cplusplus -} /* end extern "C" */ -#endif -#endif // TENSORFLOW_C_EXPERIMENTAL_NETWORK_H_ diff --git a/tensorflow/c/experimental/network_internal.h b/tensorflow/c/experimental/network_internal.h deleted file mode 100644 index 389de440b70..00000000000 --- a/tensorflow/c/experimental/network_internal.h +++ /dev/null @@ -1,77 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#ifndef TENSORFLOW_C_EXPERIMENTAL_NETWORK_INTERNAL_H_ -#define TENSORFLOW_C_EXPERIMENTAL_NETWORK_INTERNAL_H_ - -#include - -#include "tensorflow/c/c_api.h" -#include "tensorflow/c/experimental/network.h" -#include "tensorflow/c/experimental/rendezvous.h" -#include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h" -#include "tensorflow/core/distributed_runtime/server_lib.h" -#include "tensorflow/core/platform/env.h" -#include "tensorflow/core/platform/status.h" -#include "tensorflow/core/protobuf/tensorflow_server.pb.h" - -namespace tensorflow { - -// GrpcServer implementation that forwards calls to callbacks. -class CGrpcServer : public GrpcServer { - protected: - CGrpcServer(const ServerDef& server_def, - void (*start_function)(const TF_GrpcServer*, void*, TF_Status*), - void (*stop_function)(const TF_GrpcServer*, void*, TF_Status*), - void (*join_function)(const TF_GrpcServer*, void*, TF_Status*), - void (*delete_function)(void*)) - : GrpcServer(server_def, ::tensorflow::Env::Default()), - start_function_(start_function), - stop_function_(stop_function), - join_function_(join_function), - delete_function_(delete_function), - context_(nullptr) {} - - public: - static Status Create( - const ServerDef& server_def, - void* (*init_function)(const TF_GrpcServer*, TF_Status*), - void (*start_function)(const TF_GrpcServer*, void*, TF_Status*), - void (*stop_function)(const TF_GrpcServer*, void*, TF_Status*), - void (*join_function)(const TF_GrpcServer*, void*, TF_Status*), - void (*delete_function)(void*), - TF_RemoteRendezvousBuilder* rendezvous_builder, - std::unique_ptr* out_server); - - Status Start() override; - Status Stop() override; - Status Join() override; - - ~CGrpcServer() override { delete_function_(context_); } - - protected: - void SetContext(void* context) { context_ = context; } - - private: - void (*start_function_)(const TF_GrpcServer*, void*, TF_Status*); - void (*stop_function_)(const TF_GrpcServer*, void*, TF_Status*); - void (*join_function_)(const TF_GrpcServer*, void*, TF_Status*); - void (*delete_function_)(void*); - void* context_; - - friend class NetworksTest; -}; - -} // namespace tensorflow -#endif // TENSORFLOW_C_EXPERIMENTAL_NETWORK_INTERNAL_H_ diff --git a/tensorflow/c/experimental/network_test.cc b/tensorflow/c/experimental/network_test.cc deleted file mode 100644 index b7a50008c37..00000000000 --- 
a/tensorflow/c/experimental/network_test.cc +++ /dev/null @@ -1,256 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#include "tensorflow/c/experimental/network.h" - -#include -#include -#include - -#include -#include - -#include "absl/synchronization/notification.h" -#include "absl/time/time.h" -#include "tensorflow/c/c_api.h" -#include "tensorflow/c/experimental/network_internal.h" -#include "tensorflow/c/experimental/rendezvous.h" -#include "tensorflow/c/experimental/rendezvous_internal.h" -#include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h" -#include "tensorflow/core/distributed_runtime/rpc/async_service_interface.h" -#include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h" -#include "tensorflow/core/distributed_runtime/server_lib.h" -#include "tensorflow/core/distributed_runtime/session_mgr.h" -#include "tensorflow/core/distributed_runtime/worker_env.h" -#include "tensorflow/core/distributed_runtime/worker_session.h" -#include "tensorflow/core/framework/allocator.h" -#include "tensorflow/core/framework/rendezvous.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/core/platform/status.h" -#include "tensorflow/core/platform/strcat.h" -#include "tensorflow/core/platform/test.h" -#include "tensorflow/core/protobuf/cluster.pb.h" -#include "tensorflow/core/protobuf/tensorflow_server.pb.h" - -namespace tensorflow { - -bool accept_functionA(const char* protocol_name) { - return strcmp(protocol_name, "grpc+A") == 0; -} - -bool accept_functionB(const char* protocol_name) { - return strcmp(protocol_name, "grpc+B") == 0; -} - -struct SomeServerData { - bool server_started = false; -}; - -struct SomeRendezvousData { - int test = 0; -}; - -void* init_function(const TF_GrpcServer* server, TF_Status* status) { - SomeServerData* server_data = new SomeServerData(); - TF_SetStatus(status, TF_OK, ""); - return server_data; -} - -void start_function(const TF_GrpcServer* server, void* context, - TF_Status* status) { - auto* server_data = static_cast(context); - server_data->server_started = true; - TF_SetStatus(status, TF_OK, ""); -} - -void stop_function(const TF_GrpcServer* server, void* context, - TF_Status* status) { - TF_SetStatus(status, TF_OK, ""); -} - -void join_function(const TF_GrpcServer* server, void* context, - TF_Status* status) { - TF_SetStatus(status, TF_OK, ""); -} - -void delete_function(void* context) { - auto* server_data = static_cast(context); - delete server_data; -} - -void* rendezvous_init_function(void* server_context) { - return new SomeRendezvousData(); -} - -void Deallocator(void* data, size_t, void* arg) { - tensorflow::cpu_allocator()->DeallocateRaw(data); - *reinterpret_cast(arg) = true; -} - -void receive_from_remote_async_function(TF_ParsedKey* key, - TF_RendezvousArgs* args, - TF_RendezvousDoneCallback* callback, - void* context) { - 
// Create dummy tensor - const int num_bytes = 6 * sizeof(float); - float* values = - reinterpret_cast(tensorflow::cpu_allocator()->AllocateRaw( - EIGEN_MAX_ALIGN_BYTES, num_bytes)); - int64_t dims[] = {2, 3}; - bool deallocator_called = false; - auto* tensor = TF_NewTensor(TF_FLOAT, dims, 2, values, num_bytes, - &Deallocator, &deallocator_called); - callback->tensor = tensor; - auto* tf_status = TF_NewStatus(); - TF_SetStatus(tf_status, TF_OK, ""); - callback->status = tf_status; - TF_RendezvousDone(callback); - TF_DeleteStatus(tf_status); - TF_DeleteTensor(tensor); -} - -void rendezvous_delete_function(void* context) { - auto* rendezvous_data = static_cast(context); - delete rendezvous_data; -} - -tensorflow::ServerDef GetServerDef(const string& protocol, - const string& job_name, int num_tasks) { - tensorflow::ServerDef server_def; - server_def.set_protocol(protocol); - server_def.set_job_name(job_name); - server_def.set_task_index(0); - tensorflow::ClusterDef* cluster_def = server_def.mutable_cluster(); - tensorflow::JobDef* job_def = cluster_def->add_job(); - job_def->set_name(job_name); - for (int i = 0; i < num_tasks; i++) { - int port = tensorflow::testing::PickUnusedPortOrDie(); - job_def->mutable_tasks()->insert( - {i, tensorflow::strings::StrCat("localhost:", port)}); - } - return server_def; -} - -class NetworksTest : public ::testing::Test { - public: - ~NetworksTest() override {} - - SomeServerData* GetServerData(CGrpcServer* server) { - EXPECT_NE(server->context_, nullptr); - return static_cast(server->context_); - } -}; - -Rendezvous::ParsedKey Key(const string& sender, const uint64 incarnation, - const string& receiver, const string& name) { - Rendezvous::ParsedKey result; - CHECK( - Rendezvous::ParseKey(Rendezvous::CreateKey(sender, incarnation, receiver, - name, FrameAndIter(0, 0)), - &result) - .ok()); - return result; -} - -void InitializeRendezvous(GrpcServer* grpc_server, ServerDef* server_def, - RemoteRendezvous* remote_rendezvous) { - int rendezvous_id = 0; - auto session_name = tensorflow::strings::StrCat("test_", rendezvous_id); - TF_EXPECT_OK(grpc_server->worker_env()->session_mgr->CreateSession( - session_name, *server_def, true)); - - std::shared_ptr worker_session; - TF_EXPECT_OK(grpc_server->worker_env()->session_mgr->WorkerSessionForSession( - session_name, &worker_session)); - - TF_EXPECT_OK(remote_rendezvous->Initialize(worker_session.get())); -} - -TEST_F(NetworksTest, TestStartServer) { - auto* rendezvous_builder = TF_NewRemoteRendezvousBuilder( - rendezvous_init_function, receive_from_remote_async_function, - rendezvous_delete_function); - - TF_Status* tf_status = TF_NewStatus(); - TF_GrpcServerFactory* factory = TF_NewGrpcServerFactory( - accept_functionA, init_function, start_function, stop_function, - join_function, delete_function, rendezvous_builder); - TF_RegisterGrpcServerFactory("testfactoryA", factory); - - ServerDef server_def = GetServerDef("grpc+A", "localhost", 1); - std::unique_ptr server; - TF_EXPECT_OK(NewServer(server_def, &server)); - auto* grpc_server = static_cast(server.get()); - auto* server_data = GetServerData(grpc_server); - ASSERT_FALSE(server_data->server_started); - - TF_EXPECT_OK(server->Start()); - ASSERT_TRUE(server_data->server_started); - - TF_DeleteStatus(tf_status); - TF_DeleteGrpcServerFactory(factory); - TF_DeleteRemoteRendezvousBuilder(rendezvous_builder); - // TODO(annarev): find a clean way to shutdown server. 
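The Key() helper earlier in this file wraps Rendezvous::CreateKey/ParseKey; a minimal round trip looks like the sketch below (FrameAndIter(0, 0) is the default frame/iteration, and the device strings mirror the ones used in the tests).

Rendezvous::ParsedKey parsed;
// Build the full key string, then parse it back into its components.
const std::string key_string = Rendezvous::CreateKey(
    /*src_device=*/"/job:localhost/replica:1/task:2/device:CPU:0",
    /*src_incarnation=*/1,
    /*dst_device=*/"/job:localhost/replica:0/task:0/device:CPU:0",
    /*name=*/"test", FrameAndIter(0, 0));
TF_CHECK_OK(Rendezvous::ParseKey(key_string, &parsed));
// parsed.src_device, parsed.dst_device and parsed.FullKey() are now populated.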
- server.release(); -} - -TEST_F(NetworksTest, TestReceiveData) { - auto* rendezvous_builder = TF_NewRemoteRendezvousBuilder( - rendezvous_init_function, receive_from_remote_async_function, - rendezvous_delete_function); - - TF_Status* tf_status = TF_NewStatus(); - TF_GrpcServerFactory* factory = TF_NewGrpcServerFactory( - accept_functionB, init_function, start_function, stop_function, - join_function, delete_function, rendezvous_builder); - TF_RegisterGrpcServerFactory("testfactoryB", factory); - - ServerDef server_def = GetServerDef("grpc+B", "localhost", 1); - std::unique_ptr server; - TF_EXPECT_OK(NewServer(server_def, &server)); - auto* grpc_server = static_cast(server.get()); - - TF_EXPECT_OK(server->Start()); - auto* rendezvous_mgr = grpc_server->worker_env()->rendezvous_mgr; - auto* remote_rendezvous = rendezvous_mgr->Find(0); - - auto key = Key("/job:localhost/replica:1/task:2/device:CPU:0", 1, - "/job:localhost/replica:0/task:0/device:CPU:0", "test"); - Rendezvous::Args args; - bool done_callback_called = false; - auto* done_callback_called_ptr = &done_callback_called; - absl::Notification notification; - auto* notification_ptr = ¬ification; - - InitializeRendezvous(grpc_server, &server_def, remote_rendezvous); - remote_rendezvous->RecvAsync( - key, args, - [done_callback_called_ptr, notification_ptr]( - const Status&, const Rendezvous::Args&, const Rendezvous::Args&, - const Tensor&, const bool) mutable { - *done_callback_called_ptr = true; - notification_ptr->Notify(); - }); - notification.WaitForNotificationWithTimeout(absl::Seconds(10)); - ASSERT_EQ(done_callback_called, true); - - TF_DeleteStatus(tf_status); - TF_DeleteGrpcServerFactory(factory); - TF_DeleteRemoteRendezvousBuilder(rendezvous_builder); - // Server doesn't have a clean shutdown. - server.release(); -} - -} // namespace tensorflow diff --git a/tensorflow/c/experimental/ops/BUILD b/tensorflow/c/experimental/ops/BUILD new file mode 100644 index 00000000000..d13d7a72d3e --- /dev/null +++ b/tensorflow/c/experimental/ops/BUILD @@ -0,0 +1,48 @@ +# Experimental ops. These will eventually be replaced by machine-generated versions. +package( + licenses = ["notice"], # Apache 2.0 +) + +cc_library( + name = "array_ops", + srcs = [ + "array_ops.cc", + ], + hdrs = [ + "array_ops.h", + ], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + "//tensorflow/c/eager:abstract_context", + "//tensorflow/c/eager:abstract_operation", + "//tensorflow/c/eager:abstract_tensor_handle", + "//tensorflow/c/eager:c_api_unified_internal", + "//tensorflow/core/lib/llvm_rtti", + "//tensorflow/core/platform:errors", + ], +) + +cc_library( + name = "math_ops", + srcs = [ + "math_ops.cc", + ], + hdrs = [ + "math_ops.h", + ], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + ":array_ops", + "//tensorflow/c/eager:abstract_context", + "//tensorflow/c/eager:abstract_operation", + "//tensorflow/c/eager:abstract_tensor_handle", + "//tensorflow/c/eager:c_api_unified_internal", + "//tensorflow/core:framework_headers_lib", + "//tensorflow/core/lib/llvm_rtti", + "//tensorflow/core/platform:errors", + ], +) diff --git a/tensorflow/c/experimental/ops/array_ops.cc b/tensorflow/c/experimental/ops/array_ops.cc new file mode 100644 index 00000000000..ab2d114d9d9 --- /dev/null +++ b/tensorflow/c/experimental/ops/array_ops.cc @@ -0,0 +1,39 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
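The TestReceiveData case earlier in this hunk drives the asynchronous receive path. The pattern is to record completion state in the done callback, signal an absl::Notification, and bound the wait with a timeout so a stuck RPC fails the test instead of hanging it. A trimmed sketch, assuming the `remote_rendezvous`, `key`, and `args` set up in that test:

absl::Notification done;
bool callback_ran = false;
remote_rendezvous->RecvAsync(
    key, args,
    [&callback_ran, &done](const Status& status, const Rendezvous::Args&,
                           const Rendezvous::Args&, const Tensor&,
                           bool is_dead) {
      // Record that the receive completed and unblock the waiting thread.
      callback_ran = true;
      done.Notify();
    });
done.WaitForNotificationWithTimeout(absl::Seconds(10));
ASSERT_TRUE(callback_ran);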
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/c/experimental/ops/array_ops.h" + +#include "tensorflow/c/eager/c_api_unified_experimental_internal.h" +#include "tensorflow/core/platform/errors.h" + +namespace tensorflow { +namespace ops { +// Creates an Identity op. +Status Identity(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name) { + AbstractOperationPtr identity_op(ctx->CreateOperation()); + TF_RETURN_IF_ERROR( + identity_op->Reset("Identity", /*raw_device_name=*/nullptr)); + if (isa(identity_op.get())) { + TF_RETURN_IF_ERROR(dyn_cast(identity_op.get()) + ->SetOpName(name)); + } + TF_RETURN_IF_ERROR(identity_op->AddInput(inputs[0])); + int num_retvals = 1; + return identity_op->Execute(outputs, &num_retvals); +} + +} // namespace ops +} // namespace tensorflow diff --git a/tensorflow/c/experimental/ops/array_ops.h b/tensorflow/c/experimental/ops/array_ops.h new file mode 100644 index 00000000000..226461fd286 --- /dev/null +++ b/tensorflow/c/experimental/ops/array_ops.h @@ -0,0 +1,31 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_OPS_ARRAY_OPS_H_ +#define TENSORFLOW_C_EXPERIMENTAL_OPS_ARRAY_OPS_H_ + +#include "tensorflow/c/eager/abstract_context.h" +#include "tensorflow/c/eager/abstract_operation.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/core/lib/llvm_rtti/llvm_rtti.h" + +namespace tensorflow { +namespace ops { +Status Identity(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name); +} // namespace ops +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_OPS_ARRAY_OPS_H_ diff --git a/tensorflow/c/experimental/ops/math_ops.cc b/tensorflow/c/experimental/ops/math_ops.cc new file mode 100644 index 00000000000..e91acbd6370 --- /dev/null +++ b/tensorflow/c/experimental/ops/math_ops.cc @@ -0,0 +1,55 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
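The absl::Span template arguments in the Identity declaration above were lost in this rendering; presumably inputs are absl::Span<AbstractTensorHandle* const> and outputs are absl::Span<AbstractTensorHandle*>. A hypothetical call site under that assumption (CallIdentity, ctx, and x are placeholder names):

Status CallIdentity(AbstractContext* ctx, AbstractTensorHandle* x,
                    AbstractTensorHandle** result) {
  // One input, one expected return value.
  AbstractTensorHandle* inputs[] = {x};
  std::vector<AbstractTensorHandle*> outputs(1);
  TF_RETURN_IF_ERROR(ops::Identity(ctx, absl::MakeConstSpan(inputs),
                                   absl::MakeSpan(outputs), "my_identity"));
  *result = outputs[0];
  return Status::OK();
}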
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/c/experimental/ops/math_ops.h" + +#include "tensorflow/c/eager/abstract_context.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/c_api_unified_experimental_internal.h" +#include "tensorflow/c/experimental/ops/array_ops.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/errors.h" +namespace tensorflow { +namespace ops { +using tensorflow::tracing::TracingOperation; + +Status Mul(AbstractContext* ctx, absl::Span inputs, + absl::Span outputs, const char* name) { + AbstractOperationPtr mul_op(ctx->CreateOperation()); + TF_RETURN_IF_ERROR(mul_op->Reset("Mul", /*raw_device_name=*/nullptr)); + if (isa(mul_op.get())) { + TF_RETURN_IF_ERROR( + dyn_cast(mul_op.get())->SetOpName(name)); + } + TF_RETURN_IF_ERROR(mul_op->AddInput(inputs[0])); + TF_RETURN_IF_ERROR(mul_op->AddInput(inputs[1])); + int num_retvals = 1; + return mul_op->Execute(outputs, &num_retvals); +} + +Status Conj(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name) { + auto dtype = inputs[0]->DataType(); + if (DataTypeIsFloating(BaseType(dtype)) || + DataTypeIsInteger(BaseType(dtype))) { + TF_RETURN_IF_ERROR(Identity(ctx, inputs, outputs, name)); + } else { + return errors::Unimplemented("Conj does not support complex types yet."); + } + return Status::OK(); +} + +} // namespace ops +} // namespace tensorflow diff --git a/tensorflow/c/experimental/ops/math_ops.h b/tensorflow/c/experimental/ops/math_ops.h new file mode 100644 index 00000000000..4d7c3d838ce --- /dev/null +++ b/tensorflow/c/experimental/ops/math_ops.h @@ -0,0 +1,31 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_OPS_MATH_OPS_H_ +#define TENSORFLOW_C_EXPERIMENTAL_OPS_MATH_OPS_H_ + +#include "tensorflow/c/eager/abstract_context.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" + +namespace tensorflow { +namespace ops { +Status Mul(AbstractContext* ctx, absl::Span inputs, + absl::Span outputs, const char* name); +Status Conj(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name); +} // namespace ops +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_OPS_MATH_OPS_H_ diff --git a/tensorflow/c/experimental/rendezvous.cc b/tensorflow/c/experimental/rendezvous.cc deleted file mode 100644 index c996cfb44f3..00000000000 --- a/tensorflow/c/experimental/rendezvous.cc +++ /dev/null @@ -1,127 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
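In the Mul and Identity helpers above, the template arguments of isa<>/dyn_cast<> were stripped by the rendering. Given the `using tensorflow::tracing::TracingOperation;` declaration, the guarded name-setting presumably reads:

// Only tracing (graph-building) operations carry a user-visible op name;
// eager ops skip SetOpName entirely.
if (isa<TracingOperation>(mul_op.get())) {
  TF_RETURN_IF_ERROR(
      dyn_cast<TracingOperation>(mul_op.get())->SetOpName(name));
}

Conj forwards to Identity because conjugation is a no-op for real-valued and integer dtypes; complex dtypes would need a dedicated kernel, hence the Unimplemented error above.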
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#include "tensorflow/c/experimental/rendezvous.h" - -#include - -#include "tensorflow/c/c_api.h" -#include "tensorflow/c/c_api_internal.h" -#include "tensorflow/c/experimental/rendezvous_internal.h" -#include "tensorflow/core/distributed_runtime/base_rendezvous_mgr.h" -#include "tensorflow/core/distributed_runtime/worker_env.h" -#include "tensorflow/core/framework/allocator.h" -#include "tensorflow/core/framework/rendezvous.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/platform/errors.h" -#include "tensorflow/core/platform/status.h" -#include "tensorflow/core/platform/stringpiece.h" - -namespace tensorflow { - -CRemoteRendezvous::CRemoteRendezvous(const WorkerEnv* env, int64 step_id, - void (*receive_from_remote_async_function)( - TF_ParsedKey*, TF_RendezvousArgs*, - TF_RendezvousDoneCallback*, - void* context), - void (*delete_function)(void* context), - void* server_context) - : BaseRemoteRendezvous(env, step_id), - receive_from_remote_async_function_(receive_from_remote_async_function), - delete_function_(delete_function), - context_(nullptr) {} - -void CRemoteRendezvous::RecvFromRemoteAsync(const Rendezvous::ParsedKey& parsed, - const Rendezvous::Args& args, - DoneCallback done) { - if (args.cancellation_manager != nullptr) { - VLOG(1) << "WARNING: CRemoteRendezvous does not support cancellation."; - } - TF_ParsedKey key; - key.src_device = parsed.src_device.data(); - key.src_device_len = parsed.src_device.size(); - key.dst_device = parsed.dst_device.data(); - key.dst_device_len = parsed.dst_device.size(); - key.full_key = parsed.FullKey().data(); - key.full_key_len = parsed.FullKey().size(); - - TF_DeviceContext* device_context = new TF_DeviceContext(); - device_context->context = args.device_context; - - TF_AllocatorAttributes* alloc_attrs = new TF_AllocatorAttributes(); - alloc_attrs->value = args.alloc_attrs.value; - alloc_attrs->scope_id = args.alloc_attrs.scope_id; - alloc_attrs->on_host = args.alloc_attrs.on_host(); - alloc_attrs->nic_compatible = args.alloc_attrs.nic_compatible(); - - TF_RendezvousArgs* cargs = new TF_RendezvousArgs(); - cargs->device_context = device_context; - cargs->alloc_attrs = alloc_attrs; - - TF_RendezvousDoneCallback* done_callback = new TF_RendezvousDoneCallback(); - done_callback->done_callback = done; - done_callback->recv_args = cargs; - - receive_from_remote_async_function_(&key, cargs, done_callback, context_); -} - -CRemoteRendezvous::~CRemoteRendezvous() { delete_function_(context_); } -} // namespace tensorflow - -TF_RemoteRendezvousBuilder* TF_NewRemoteRendezvousBuilder( - void* (*init_function)(void* server_context), - void (*receive_from_remote_async_function)(TF_ParsedKey*, - TF_RendezvousArgs*, - TF_RendezvousDoneCallback*, - void* context), - void (*delete_function)(void* context)) { - TF_RemoteRendezvousBuilder* builder = new TF_RemoteRendezvousBuilder(); - builder->init_function = init_function; - builder->delete_function = delete_function; - builder->receive_from_remote_async_function = - receive_from_remote_async_function; - 
return builder; -} - -void TF_DeleteRemoteRendezvousBuilder( - TF_RemoteRendezvousBuilder* rendezvous_builder) { - DCHECK_NE(rendezvous_builder, nullptr); - delete rendezvous_builder; -} - -TF_CAPI_EXPORT extern void TF_RendezvousDone( - TF_RendezvousDoneCallback* callback) { - DCHECK_NE(callback, nullptr); - ::tensorflow::Tensor tensor; - TF_CHECK_OK(TF_TensorToTensor(callback->tensor, &tensor)); - ::tensorflow::Rendezvous::Args recv_args; - recv_args.alloc_attrs.value = callback->recv_args->alloc_attrs->value; - recv_args.alloc_attrs.scope_id = callback->recv_args->alloc_attrs->scope_id; - recv_args.device_context = callback->recv_args->device_context->context; - ::tensorflow::Rendezvous::Args sent_args; - - callback->done_callback(callback->status->status, sent_args, recv_args, - tensor, callback->dead); - - if (callback->recv_args) { - DCHECK_NE(callback->recv_args, nullptr); - DCHECK_NE(callback->recv_args->alloc_attrs, nullptr); - DCHECK_NE(callback->recv_args->device_context, nullptr); - delete callback->recv_args->alloc_attrs; - delete callback->recv_args->device_context; - delete callback->recv_args; - } - delete callback; - callback = nullptr; -} diff --git a/tensorflow/c/experimental/rendezvous.h b/tensorflow/c/experimental/rendezvous.h deleted file mode 100644 index 5b007d52429..00000000000 --- a/tensorflow/c/experimental/rendezvous.h +++ /dev/null @@ -1,67 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#ifndef TENSORFLOW_C_EXPERIMENTAL_RENDEZVOUS_H_ -#define TENSORFLOW_C_EXPERIMENTAL_RENDEZVOUS_H_ - -#include "tensorflow/c/c_api.h" - -#ifdef __cplusplus -extern "C" { -#endif - -// -------------------------------------------------------------------------- -// C API for Rendezvous. -// NOTE: This API is unstable and almost certainly will change in the near -// future. -// -// Custom rendezvous allows for custom implementations of Recv call. -// -// Users wishing to create custom rendezvous objects should call -// TF_NewRemoteRendezvousBuilder and pass returned TF_RemoteRendezvousBuilder -// to to TF_NewServerFactory. - -typedef struct TF_RemoteRendezvousBuilder TF_RemoteRendezvousBuilder; -typedef struct TF_ParsedKey TF_ParsedKey; -typedef struct TF_RendezvousArgs TF_RendezvousArgs; -typedef struct TF_RendezvousDoneCallback TF_RendezvousDoneCallback; - -// Creates a new TF_RemoteRendezvousBuilder instance. -// Rendezvous instances will forward calls to init_function, -// receive_from_remote_async_function and delete_function passed here. -// -// Note that receive_from_remote_async_function implementation must call -// TF_Done with the TF_DoneCallback passed as an argument. 
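A minimal receive callback honoring that contract is sketched below (MyRecv is a hypothetical name). Per the TF_RendezvousDone documentation further down, the callee fills in `tensor` and `status`, calls TF_RendezvousDone, and then remains responsible for deleting both.

void MyRecv(TF_ParsedKey* key, TF_RendezvousArgs* args,
            TF_RendezvousDoneCallback* callback, void* context) {
  // Produce some tensor for this key; here a single zeroed float.
  int64_t dims[] = {1};
  TF_Tensor* tensor = TF_AllocateTensor(TF_FLOAT, dims, /*num_dims=*/1,
                                        sizeof(float));
  static_cast<float*>(TF_TensorData(tensor))[0] = 0.0f;

  TF_Status* status = TF_NewStatus();
  TF_SetStatus(status, TF_OK, "");

  callback->tensor = tensor;
  callback->status = status;
  TF_RendezvousDone(callback);

  // TF_RendezvousDone does not take ownership of tensor/status.
  TF_DeleteStatus(status);
  TF_DeleteTensor(tensor);
}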
-TF_CAPI_EXPORT extern TF_RemoteRendezvousBuilder* TF_NewRemoteRendezvousBuilder( - void* (*init_function)(void* server_context), - void (*receive_from_remote_async_function)(TF_ParsedKey*, - TF_RendezvousArgs*, - TF_RendezvousDoneCallback*, - void* context), - void (*delete_function)(void* context)); - -// Deletes TF_RemoteRendezvousBuilder instances. -TF_CAPI_EXPORT extern void TF_DeleteRemoteRendezvousBuilder( - TF_RemoteRendezvousBuilder* rendezvous_builder); - -// Calls TF_DoneCallback and destroys callback instance and -// TF_DoneCallback members except `tensor` and `status`. Caller is -// responsible for deleting `tensor` and `status` after TF_Done returns. -TF_CAPI_EXPORT extern void TF_RendezvousDone( - TF_RendezvousDoneCallback* callback); - -#ifdef __cplusplus -} /* end extern "C" */ -#endif -#endif // TENSORFLOW_C_EXPERIMENTAL_RENDEZVOUS_H_ diff --git a/tensorflow/c/experimental/rendezvous_internal.h b/tensorflow/c/experimental/rendezvous_internal.h deleted file mode 100644 index f06686023e6..00000000000 --- a/tensorflow/c/experimental/rendezvous_internal.h +++ /dev/null @@ -1,135 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#ifndef TENSORFLOW_C_EXPERIMENTAL_RENDEZVOUS_INTERNAL_H_ -#define TENSORFLOW_C_EXPERIMENTAL_RENDEZVOUS_INTERNAL_H_ - -#include - -#include "tensorflow/c/c_api.h" -#include "tensorflow/c/experimental/rendezvous.h" -#include "tensorflow/core/distributed_runtime/base_rendezvous_mgr.h" -#include "tensorflow/core/distributed_runtime/worker_env.h" -#include "tensorflow/core/framework/device_base.h" -#include "tensorflow/core/framework/rendezvous.h" -#include "tensorflow/core/platform/macros.h" - -struct TF_ParsedKey { - // char* members might not be null-terminated. - const char* src_device; - size_t src_device_len; - const char* dst_device; - size_t dst_device_len; - const char* full_key; - size_t full_key_len; -}; - -struct TF_AllocatorAttributes { - bool on_host; - bool nic_compatible; - // NOTE: The upper 8 bits of the value are reserved for - // device-specific uses. Implementors of a device can interpret these - // upper 8 bits in device-specific ways, and ops implemented for those - // devices are responsible for setting those 8 bits appropriately. - tensorflow::uint32 value = 0; - // EXPERIMENTAL: If this is greater than zero, then allocation is delegated to - // a named special-purpose allocator on the same device. - tensorflow::int32 scope_id = 0; -}; - -struct TF_DeviceContext { - ::tensorflow::DeviceContext* context; -}; - -struct TF_RendezvousArgs { - const TF_DeviceContext* device_context; - const TF_AllocatorAttributes* alloc_attrs; -}; - -struct TF_RendezvousDoneCallback { - ::tensorflow::Rendezvous::DoneCallback done_callback; - - // TODO(annarev): figure out if we should also support sent_args. 
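As the comment on TF_ParsedKey above notes, its char* members are not null-terminated, so a consumer must pair each pointer with its length rather than treating it as a C string. A small sketch:

absl::string_view src(key->src_device, key->src_device_len);
absl::string_view dst(key->dst_device, key->dst_device_len);
std::string full_key(key->full_key, key->full_key_len);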
- const TF_RendezvousArgs* recv_args; - TF_Tensor* tensor = nullptr; - TF_Status* status; - bool dead; -}; - -struct TF_RemoteRendezvousBuilder { - void* (*init_function)(void* server_context); - void (*receive_from_remote_async_function)(TF_ParsedKey*, TF_RendezvousArgs*, - TF_RendezvousDoneCallback*, - void* context); - void (*delete_function)(void* context); - void* server_context; -}; - -namespace tensorflow { - -class CRemoteRendezvous : public BaseRemoteRendezvous { - public: - CRemoteRendezvous(const WorkerEnv* env, int64 step_id, - void (*receive_from_remote_async_function)( - TF_ParsedKey*, TF_RendezvousArgs*, - TF_RendezvousDoneCallback*, void* context), - void (*delete_function)(void* context), - void* server_context); - - void SetContext(void* context) { context_ = context; } - - protected: - void RecvFromRemoteAsync(const Rendezvous::ParsedKey& parsed, - const Rendezvous::Args& args, - DoneCallback done) override; - - private: - ~CRemoteRendezvous() override; - - void (*receive_from_remote_async_function_)(TF_ParsedKey*, TF_RendezvousArgs*, - TF_RendezvousDoneCallback*, - void* context); - void (*delete_function_)(void* context); - void* context_; - TF_DISALLOW_COPY_AND_ASSIGN(CRemoteRendezvous); -}; - -class CRendezvousMgr : public BaseRendezvousMgr { - public: - CRendezvousMgr(const WorkerEnv* env, - const TF_RemoteRendezvousBuilder* rendezvous_builder) - : BaseRendezvousMgr(env), rendezvous_builder_(rendezvous_builder) {} - - protected: - BaseRemoteRendezvous* Create(int64 step_id, - const WorkerEnv* worker_env) override { - auto* rendezvous = new CRemoteRendezvous( - worker_env, step_id, - rendezvous_builder_->receive_from_remote_async_function, - rendezvous_builder_->delete_function, - rendezvous_builder_->server_context); - - rendezvous->SetContext(rendezvous_builder_->init_function( - rendezvous_builder_->server_context)); - return rendezvous; - } - - private: - const TF_RemoteRendezvousBuilder* rendezvous_builder_; - TF_DISALLOW_COPY_AND_ASSIGN(CRendezvousMgr); -}; - -} // namespace tensorflow - -#endif // TENSORFLOW_C_EXPERIMENTAL_RENDEZVOUS_INTERNAL_H_ diff --git a/tensorflow/c/experimental/saved_model/core/BUILD b/tensorflow/c/experimental/saved_model/core/BUILD index 5931e229e28..b2e432782de 100644 --- a/tensorflow/c/experimental/saved_model/core/BUILD +++ b/tensorflow/c/experimental/saved_model/core/BUILD @@ -26,6 +26,7 @@ cc_library( ":function_metadata", "//tensorflow/c/eager:immediate_execution_operation", "//tensorflow/c/eager:immediate_execution_tensor_handle", + "@com_google_absl//absl/types:span", ], ) @@ -113,8 +114,23 @@ cc_library( deps = [ ":concrete_function", ":saved_model_api", + ":saved_model_utils", + "//tensorflow/c:tensor_interface", "//tensorflow/c/eager:immediate_execution_context", + "//tensorflow/c/eager:immediate_execution_tensor_handle", + "//tensorflow/c/experimental/saved_model/core/ops:restore_ops", + "//tensorflow/c/experimental/saved_model/core/revived_types:constant", + "//tensorflow/c/experimental/saved_model/core/revived_types:tensorhandle_convertible", + "//tensorflow/c/experimental/saved_model/core/revived_types:tf_concrete_function", + "//tensorflow/c/experimental/saved_model/core/revived_types:variable", + "//tensorflow/cc/saved_model:bundle_v2", + "//tensorflow/cc/saved_model:constants", + "//tensorflow/core:framework", "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/common_runtime/eager:tensor_handle", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/strings", 
"@com_google_absl//absl/types:optional", ], ) @@ -131,6 +147,7 @@ cc_library( "//tensorflow/c/eager:immediate_execution_operation", "//tensorflow/c/eager:immediate_execution_tensor_handle", "//tensorflow/core:lib", + "@com_google_absl//absl/types:span", ], ) @@ -199,6 +216,23 @@ tf_cc_test( ], ) +tf_cc_test( + name = "signature_flattening_test", + srcs = [ + "signature_flattening_test.cc", + ], + deps = [ + ":saved_model_utils", + "//tensorflow/c/experimental/saved_model/core:tf_concrete_function_test_protos", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core/common_runtime/eager:core", + ], +) + tf_cc_test( name = "tf_concrete_function_loading_test", srcs = [ diff --git a/tensorflow/c/experimental/saved_model/core/concrete_function.h b/tensorflow/c/experimental/saved_model/core/concrete_function.h index 2cc627bcf27..da3a64b91a3 100644 --- a/tensorflow/c/experimental/saved_model/core/concrete_function.h +++ b/tensorflow/c/experimental/saved_model/core/concrete_function.h @@ -19,6 +19,7 @@ limitations under the License. #include #include +#include "absl/types/span.h" #include "tensorflow/c/eager/immediate_execution_operation.h" #include "tensorflow/c/eager/immediate_execution_tensor_handle.h" #include "tensorflow/c/experimental/saved_model/core/function_metadata.h" @@ -38,10 +39,9 @@ class ConcreteFunction { virtual ~ConcreteFunction() = default; // This method returns the "Call" Op used to execute the function. - virtual Status GetCallOp(ImmediateOpPtr* out) = 0; + virtual Status GetCallOp(absl::Span inputs, + ImmediateOpPtr* out) = 0; - virtual const std::vector& GetCaptures() - const = 0; virtual const FunctionMetadata& GetFunctionMetadata() const = 0; }; diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/BUILD b/tensorflow/c/experimental/saved_model/core/revived_types/BUILD index 8bb15674db0..2b883618c87 100644 --- a/tensorflow/c/experimental/saved_model/core/revived_types/BUILD +++ b/tensorflow/c/experimental/saved_model/core/revived_types/BUILD @@ -69,6 +69,7 @@ cc_library( ], deps = [ ":tensorhandle_convertible", + "//tensorflow/c/eager:abstract_tensor_handle", "//tensorflow/c/eager:immediate_execution_context", "//tensorflow/c/eager:immediate_execution_operation", "//tensorflow/c/eager:immediate_execution_tensor_handle", @@ -77,5 +78,6 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core/common_runtime/eager:context", + "@com_google_absl//absl/types:span", ], ) diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.cc b/tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.cc index aa6f0e7205e..f734f9eca66 100644 --- a/tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.cc +++ b/tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.cc @@ -18,6 +18,8 @@ limitations under the License. 
#include #include +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" #include "tensorflow/c/eager/immediate_execution_operation.h" #include "tensorflow/c/eager/immediate_execution_tensor_handle.h" #include "tensorflow/c/experimental/saved_model/core/revived_types/tensorhandle_convertible.h" @@ -60,16 +62,12 @@ Status TFConcreteFunction::Create( return Status(); } -const std::vector& -TFConcreteFunction::GetCaptures() const { - return captures_; -} - const FunctionMetadata& TFConcreteFunction::GetFunctionMetadata() const { return metadata_; } -Status TFConcreteFunction::GetCallOp(ImmediateOpPtr* out) { +Status TFConcreteFunction::GetCallOp( + absl::Span inputs, ImmediateOpPtr* out) { out->reset(ctx_->CreateOperation()); // In eager mode, TF2 python executes functions by constructing an op with // the name of the functiondef: @@ -81,6 +79,16 @@ Status TFConcreteFunction::GetCallOp(ImmediateOpPtr* out) { // PartitionedCallOp for compatibility with "tooling that assumes functions in // graphs are PartitionedCallOps". TF_RETURN_IF_ERROR((*out)->Reset(name_.c_str(), nullptr)); + + // Adding the user-provided inputs to the function. + TF_RETURN_IF_ERROR((*out)->AddInputList(inputs)); + + absl::Span captures( + reinterpret_cast(captures_.data()), + captures_.size()); + + // Adding the captures of the function. + TF_RETURN_IF_ERROR((*out)->AddInputList(captures)); return Status(); } diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.h b/tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.h index 71c8322414d..d38f3546f91 100644 --- a/tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.h +++ b/tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.h @@ -58,10 +58,8 @@ class TFConcreteFunction : public ConcreteFunction { std::unique_ptr* out); // This method returns the "Call" Op used to execute the function. - Status GetCallOp(ImmediateOpPtr* out) override; - - const std::vector& GetCaptures() - const override; + Status GetCallOp(absl::Span inputs, + ImmediateOpPtr* out) override; const FunctionMetadata& GetFunctionMetadata() const override; diff --git a/tensorflow/c/experimental/saved_model/core/saved_model_utils.cc b/tensorflow/c/experimental/saved_model/core/saved_model_utils.cc index 2037c4886de..0d97741d7f0 100644 --- a/tensorflow/c/experimental/saved_model/core/saved_model_utils.cc +++ b/tensorflow/c/experimental/saved_model/core/saved_model_utils.cc @@ -27,6 +27,7 @@ limitations under the License. #include "tensorflow/core/framework/tensor.pb.h" #include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/stringpiece.h" #include "tensorflow/core/protobuf/saved_object_graph.pb.h" #include "tensorflow/core/protobuf/struct.pb.h" @@ -36,52 +37,8 @@ namespace tensorflow { namespace internal { namespace { -// This returns the size of `tf.nest.flatten(value)`, on values that are -// used in tf.function's input_signatures. 
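In the GetCallOp implementation earlier in this hunk, the rendering dropped the Span and reinterpret_cast template arguments. Assuming `captures_` holds ImmediateExecutionTensorHandle pointers, the captures are presumably re-exposed as base-class handles like so; they are appended after the user inputs so the combined list matches the FunctionDef's expected input count (flattened user inputs plus bound captures), which the validation below checks.

// View the stored captures as AbstractTensorHandle* without copying.
absl::Span<AbstractTensorHandle* const> captures(
    reinterpret_cast<AbstractTensorHandle* const*>(captures_.data()),
    captures_.size());
TF_RETURN_IF_ERROR((*out)->AddInputList(captures));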
-int FlattenedSize(const tensorflow::StructuredValue& value, Status* status) { - // This follows the logic from - // https://github.com/tensorflow/tensorflow/blob/1c064ab76064c58e54261b805027474885a1534d/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc#L2775 - switch (value.kind_case()) { - case StructuredValue::kDictValue: { - const DictValue& dict = value.dict_value(); - int size = 0; - for (const auto& field : dict.fields()) { - size += FlattenedSize(field.second, status); - } - return size; - } - case StructuredValue::kTupleValue: { - const TupleValue& tuple = value.tuple_value(); - int size = 0; - for (const StructuredValue& value : tuple.values()) { - size += FlattenedSize(value, status); - } - return size; - } - case StructuredValue::kListValue: { - const ListValue& list = value.list_value(); - int size = 0; - for (const StructuredValue& value : list.values()) { - size += FlattenedSize(value, status); - } - return size; - } - case StructuredValue::kTensorSpecValue: { - return 1; - } - case StructuredValue::kNoneValue: { - // Base case: do nothing. - // This arises, for example, as the top-level object of an output - // signature when there are no return values. - return 0; - } - default: { - status->Update(errors::Internal("Unhandled structured value kind ", - value.kind_case())); - return 0; - } - } -} +using StructuredValueDictEntry = + protobuf::MapPair; // Perform some basic sanity checks on SavedConcreteFunction's input and // output signatures with respect to the corresponding FunctionDef's input @@ -111,34 +68,34 @@ Status ValidateSavedFunctionCompatibleWithFunctionDef( // https://github.com/tensorflow/tensorflow/blob/1c064ab76064c58e54261b805027474885a1534d/tensorflow/python/eager/function.py#L1974-L1979 const std::string& name = function_def->signature().name(); + const StructuredValue& input_signature = saved_concrete_function.canonicalized_input_signature(); - Status status; - int input_signature_size = FlattenedSize(input_signature, &status); - TF_RETURN_IF_ERROR(status); - if (input_signature_size + saved_concrete_function.bound_inputs_size() != + std::vector input_specs; + TF_RETURN_IF_ERROR(FlattenSignature(input_signature, &input_specs)); + if (input_specs.size() + saved_concrete_function.bound_inputs_size() != function_def->signature().input_arg_size()) { return errors::FailedPrecondition( "FunctionDef ", name, " has ", function_def->signature().input_arg_size(), - " inputs, but the SavedConcreteFunction has ", input_signature_size, + " inputs, but the SavedConcreteFunction has ", input_specs.size(), " flattened user inputs and ", saved_concrete_function.bound_inputs_size(), " captured inputs."); } const StructuredValue& output_signature = saved_concrete_function.output_signature(); - int output_signature_size = FlattenedSize(output_signature, &status); - TF_RETURN_IF_ERROR(status); - if (output_signature_size != function_def->signature().output_arg_size()) { + std::vector output_specs; + TF_RETURN_IF_ERROR(FlattenSignature(output_signature, &output_specs)); + if (output_specs.size() != function_def->signature().output_arg_size()) { return errors::FailedPrecondition( "FunctionDef ", name, " has ", function_def->signature().output_arg_size(), - " outputs, but the SavedConcreteFunction has ", output_signature_size, + " outputs, but the SavedConcreteFunction has ", output_specs.size(), " flattened outputs."); } - return status; + return Status(); } } // namespace @@ -197,6 +154,62 @@ Status LoadTFConcreteFunction( out); } +Status FlattenSignature(const 
StructuredValue& signature, + std::vector* flattened_specs) { + // This follows the logic from + // https://github.com/tensorflow/tensorflow/blob/1c064ab76064c58e54261b805027474885a1534d/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc#L2775 + switch (signature.kind_case()) { + case StructuredValue::kDictValue: { + // Dictionaries must be sorted in order of keys + const DictValue& dict = signature.dict_value(); + std::vector entries; + entries.reserve(dict.fields_size()); + for (const auto& field : dict.fields()) { + entries.push_back(&field); + } + + std::sort(entries.begin(), entries.end(), + [](const StructuredValueDictEntry* x, + const StructuredValueDictEntry* y) { + return x->first < y->first; + }); + + for (const auto& entry : entries) { + TF_RETURN_IF_ERROR(FlattenSignature(entry->second, flattened_specs)); + } + return Status(); + } + case StructuredValue::kTupleValue: { + const TupleValue& tuple = signature.tuple_value(); + for (const StructuredValue& value : tuple.values()) { + TF_RETURN_IF_ERROR(FlattenSignature(value, flattened_specs)); + } + return Status(); + } + case StructuredValue::kListValue: { + const ListValue& list = signature.list_value(); + for (const StructuredValue& value : list.values()) { + TF_RETURN_IF_ERROR(FlattenSignature(value, flattened_specs)); + } + return Status(); + } + case StructuredValue::kTensorSpecValue: { + flattened_specs->push_back(&signature.tensor_spec_value()); + return Status(); + } + case StructuredValue::kNoneValue: { + // Base case: do nothing. + // This arises, for example, as the top-level object of an output + // signature when there are no return values. + return Status(); + } + default: { + return errors::Internal("Unhandled structured value kind ", + signature.kind_case()); + } + } +} + const SavedObject* FindNodeAtPath(StringPiece path, const SavedObjectGraph& object_graph) { const auto& nodes = object_graph.nodes(); diff --git a/tensorflow/c/experimental/saved_model/core/saved_model_utils.h b/tensorflow/c/experimental/saved_model/core/saved_model_utils.h index 57f30afa91b..68bfbe32222 100644 --- a/tensorflow/c/experimental/saved_model/core/saved_model_utils.h +++ b/tensorflow/c/experimental/saved_model/core/saved_model_utils.h @@ -32,6 +32,7 @@ limitations under the License. #include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/stringpiece.h" #include "tensorflow/core/protobuf/saved_object_graph.pb.h" +#include "tensorflow/core/protobuf/struct.pb.h" namespace tensorflow { namespace internal { @@ -59,10 +60,17 @@ Status LoadTFConcreteFunction( captured_objects, ImmediateExecutionContext* ctx, std::unique_ptr* out); -// Find the SavedObject in `object_graph` at location `path`. `path` must be a -// dot-delimited string of object names relative to the root object. If no -// object is found, returns nullptr. Callers must ensure `object_graph` outlives -// the returned pointer. +// Flattens `signature` into a vector of TensorSpecProto pointers back into +// `signature`. `signature` must outlive flattened_specs. `signature` must also +// be the input or output signature of a SavedConcreteFunction (i.e. "nested +// structures of tensorspecs"). +Status FlattenSignature(const StructuredValue& signature, + std::vector* flattened_specs); + +// Find the SavedObject in `object_graph` at location `path`. `path` must be +// a dot-delimited string of object names relative to the root object. If no +// object is found, returns nullptr. Callers must ensure `object_graph` +// outlives the returned pointer. 
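As an illustration of the FlattenSignature contract above: dictionary fields come back in sorted key order regardless of insertion order. The sketch below is schematic proto population using the struct.pb.h types already included here; field names "a" and "b" are arbitrary.

StructuredValue value;
// Insert "b" first, then "a".
TensorSpecProto* b = (*value.mutable_dict_value()->mutable_fields())["b"]
                         .mutable_tensor_spec_value();
b->set_name("b");
TensorSpecProto* a = (*value.mutable_dict_value()->mutable_fields())["a"]
                         .mutable_tensor_spec_value();
a->set_name("a");

std::vector<const TensorSpecProto*> specs;
TF_CHECK_OK(internal::FlattenSignature(value, &specs));
// specs[0]->name() == "a", specs[1]->name() == "b": sorted by key.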
const SavedObject* FindNodeAtPath(StringPiece path, const SavedObjectGraph& object_graph); diff --git a/tensorflow/c/experimental/saved_model/core/signature_flattening_test.cc b/tensorflow/c/experimental/saved_model/core/signature_flattening_test.cc new file mode 100644 index 00000000000..9ee495f524a --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/signature_flattening_test.cc @@ -0,0 +1,133 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "tensorflow/c/experimental/saved_model/core/saved_model_utils.h" +#include "tensorflow/c/experimental/saved_model/core/tf_concrete_function_test_protos.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/protobuf/struct.pb.h" + +namespace tensorflow { +namespace { + +// Validates names, shapes, and dtypes of two tensorspecprotos are equivalent. +bool TensorSpecsAreEqual(const TensorSpecProto& spec, + const std::string& expected_name, + const PartialTensorShape& expected_shape, + DataType expected_dtype) { + return spec.name() == expected_name && + PartialTensorShape(spec.shape()).IsIdenticalTo(expected_shape) && + spec.dtype() == expected_dtype; +} + +// This tests the common case for a tf.function w/o inputs. This ends up +// being serialized as a tuple of an empty tuple + empty dictionary +// (corresponding to the args, kwargs) of the function. +TEST(SignatureFlatteningTest, ZeroArgInputSignature) { + std::vector flattened; + StructuredValue value = testing::ZeroArgInputSignature(); + TF_EXPECT_OK(internal::FlattenSignature(value, &flattened)); + EXPECT_EQ(flattened.size(), 0); +} + +// This tests the common case for a tf.function w/o outputs. This ends up +// being serialized as a "NoneValue". 
+TEST(SignatureFlatteningTest, ZeroRetOutputSignature) { + std::vector flattened; + StructuredValue value = testing::ZeroReturnOutputSignature(); + TF_EXPECT_OK(internal::FlattenSignature(value, &flattened)); + EXPECT_EQ(flattened.size(), 0); +} + +TEST(SignatureFlatteningTest, SingleArgInputSignature) { + std::vector flattened; + StructuredValue value = testing::SingleArgInputSignature(); + TF_EXPECT_OK(internal::FlattenSignature(value, &flattened)); + EXPECT_EQ(flattened.size(), 1); + EXPECT_TRUE(TensorSpecsAreEqual(*flattened[0], + /* expected_name = */ "x", + /* expected_shape = */ {1, 10}, + /* expected_dtype = */ DT_FLOAT)) + << "Expected " << flattened[0]->DebugString(); +} + +TEST(SignatureFlatteningTest, SingleReturnOutputSignature) { + std::vector flattened; + StructuredValue value = testing::SingleReturnOutputSignature(); + TF_EXPECT_OK(internal::FlattenSignature(value, &flattened)); + EXPECT_EQ(flattened.size(), 1); + EXPECT_TRUE(TensorSpecsAreEqual(*flattened[0], + /* expected_name = */ "", + /* expected_shape = */ {1}, + /* expected_dtype = */ DT_FLOAT)) + << "Expected " << flattened[0]->DebugString(); +} + +TEST(SignatureFlatteningTest, ThreeArgInputSignature) { + std::vector flattened; + StructuredValue value = testing::ThreeArgInputSignature(); + TF_EXPECT_OK(internal::FlattenSignature(value, &flattened)); + EXPECT_EQ(flattened.size(), 3); + EXPECT_TRUE(TensorSpecsAreEqual(*flattened[0], + /* expected_name = */ "x", + /* expected_shape = */ {1}, + /* expected_dtype = */ DT_FLOAT)) + << "Expected " << flattened[0]->DebugString(); + + EXPECT_TRUE(TensorSpecsAreEqual(*flattened[1], + /* expected_name = */ "y", + /* expected_shape = */ {1}, + /* expected_dtype = */ DT_FLOAT)) + << "Expected " << flattened[1]->DebugString(); + + EXPECT_TRUE(TensorSpecsAreEqual(*flattened[2], + /* expected_name = */ "z", + /* expected_shape = */ {1}, + /* expected_dtype = */ DT_FLOAT)) + << "Expected " << flattened[2]->DebugString(); +} + +// This test has an exotic outputsignature of tuple of a +// dictionary, tensor +TEST(SignatureFlatteningTest, ThreeReturnOutputSignature) { + std::vector flattened; + StructuredValue value = testing::ThreeReturnOutputSignature(); + TF_EXPECT_OK(internal::FlattenSignature(value, &flattened)); + EXPECT_EQ(flattened.size(), 3); + EXPECT_TRUE(TensorSpecsAreEqual(*flattened[0], + /* expected_name = */ "0/a", + /* expected_shape = */ {1}, + /* expected_dtype = */ DT_FLOAT)) + << "Expected " << flattened[0]->DebugString(); + + EXPECT_TRUE(TensorSpecsAreEqual(*flattened[1], + /* expected_name = */ "0/b", + /* expected_shape = */ {1}, + /* expected_dtype = */ DT_FLOAT)) + << "Expected " << flattened[1]->DebugString(); + + EXPECT_TRUE(TensorSpecsAreEqual(*flattened[2], + /* expected_name = */ "1", + /* expected_shape = */ {1}, + /* expected_dtype = */ DT_FLOAT)) + << "Expected " << flattened[2]->DebugString(); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/c/experimental/saved_model/core/tf_saved_model_api.cc b/tensorflow/c/experimental/saved_model/core/tf_saved_model_api.cc index 225ba1db9f4..0f0102be857 100644 --- a/tensorflow/c/experimental/saved_model/core/tf_saved_model_api.cc +++ b/tensorflow/c/experimental/saved_model/core/tf_saved_model_api.cc @@ -15,47 +15,364 @@ limitations under the License. 
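A note on the map aliases introduced near the top of the tf_saved_model_api.cc hunk below: the rendering stripped their template arguments. Based on the accompanying comments and how the maps are used later in the file, plausible definitions are the following; the key types are an assumption (they could be StringPiece rather than std::string).

using FunctionDefMap = std::unordered_map<std::string, const FunctionDef*>;
using NodeAttrMap = std::unordered_map<std::string, const AttrValueMap*>;
using RevivedObjectMap =
    std::unordered_map<int, std::unique_ptr<TensorHandleConvertible>>;
using ConcreteFunctionMap =
    std::unordered_map<std::string, std::unique_ptr<TFConcreteFunction>>;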
#include "tensorflow/c/experimental/saved_model/core/tf_saved_model_api.h" +#include #include +#include #include #include +#include "absl/algorithm/container.h" +#include "absl/strings/str_split.h" +#include "absl/strings/string_view.h" #include "absl/types/optional.h" #include "tensorflow/c/eager/immediate_execution_context.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" #include "tensorflow/c/experimental/saved_model/core/concrete_function.h" +#include "tensorflow/c/experimental/saved_model/core/ops/restore_ops.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/constant.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tensorhandle_convertible.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/variable.h" +#include "tensorflow/c/experimental/saved_model/core/saved_model_utils.h" +#include "tensorflow/cc/saved_model/bundle_v2.h" +#include "tensorflow/cc/saved_model/constants.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor.pb.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/hash/hash.h" +#include "tensorflow/core/platform/casts.h" #include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/path.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/platform/tstring.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" +#include "tensorflow/core/protobuf/saved_model.pb.h" +#include "tensorflow/core/protobuf/saved_object_graph.pb.h" +#include "tensorflow/core/protobuf/trackable_object_graph.pb.h" namespace tensorflow { +// Maps from a FunctionDef's name to FunctionDef, for a given FunctionDefLibrary +using FunctionDefMap = + std::unordered_map; + +// Maps from a Nodedef's name to its corresponding AttrValues, for a given +// Graphdef +using NodeAttrMap = + std::unordered_map; + +// Maps from Node ID to an "Revived Object" implementing +// "TensorHandleConvertible" +using RevivedObjectMap = + std::unordered_map>; + +// Maps from a functiondef's name to the corresponding "TFConcreteFunction" +using ConcreteFunctionMap = + std::unordered_map>; + +namespace { + +Status ConstantFromSavedConstant( + ImmediateExecutionContext* ctx, + const tensorflow::SavedConstant& saved_constant, + const NodeAttrMap& node_attr_map, std::unique_ptr* output) { + const std::string& const_op_name = saved_constant.operation(); + const auto& node_name_and_attrs = node_attr_map.find(const_op_name); + if (node_name_and_attrs == node_attr_map.end()) { + return errors::FailedPrecondition( + "Unable to find Const operation with name'", const_op_name, + "' in SavedModel graphdef"); + } + const AttrValueMap* attrs = node_name_and_attrs->second; + const auto& attr_name_and_value = attrs->find("value"); + if (attr_name_and_value == attrs->end()) { + return errors::FailedPrecondition("Unable to find Const operation '", + const_op_name, "'s value attribute"); + } + const TensorProto& tensor_proto = attr_name_and_value->second.tensor(); + return internal::TensorProtoToConstant(ctx, 
tensor_proto, output); +} + +// Restores all non-function objects in the SavedModel's object graph. +// This function walks through the metagraph's saved object graph, and +// constructs revived versions of SavedVariable, SavedConstant, SavedAsset, and +// SavedResources. These are returned via the `out` parameter. +Status ReviveObjects( + const MetaGraphDef& metagraph, ImmediateExecutionContext* context, + std::unordered_map>* + revived_objects) { + // This is needed to restore "Constant" nodes by looking up their + // "Value" attribute. + NodeAttrMap node_attr_map = internal::NodeToAttrMap(metagraph.graph_def()); + + // Iterate through all the saved objects, restoring objects as we go. + // We don't recreate functions until all other objects have been created. + for (int i = 0; i < metagraph.object_graph_def().nodes_size(); ++i) { + const SavedObject& node = metagraph.object_graph_def().nodes(i); + if (node.kind_case() == SavedObject::kVariable) { + std::unique_ptr variable; + TF_RETURN_IF_ERROR( + internal::LoadSavedVariable(context, node.variable(), &variable)); + (*revived_objects)[i] = std::move(variable); + } else if (node.kind_case() == SavedObject::kConstant) { + std::unique_ptr constant; + TF_RETURN_IF_ERROR(ConstantFromSavedConstant(context, node.constant(), + node_attr_map, &constant)); + (*revived_objects)[i] = std::move(constant); + } else if (node.kind_case() == SavedObject::kAsset) { + // TODO(bmzhao): Implement Asset C++ class. This should be just recreating + // the full path to the asset file: + // https://github.com/tensorflow/tensorflow/blob/6a0bdbdb7c48a3491ae1277083ae3dafb4ab4d7a/tensorflow/python/saved_model/load.py#L395-L396 + // and storing it as a string tensor: + // https://github.com/tensorflow/tensorflow/blob/6a0bdbdb7c48a3491ae1277083ae3dafb4ab4d7a/tensorflow/python/training/tracking/tracking.py#L324-L325 + return errors::Unimplemented("SavedAsset loading is not implemented yet"); + } else if (node.kind_case() == SavedObject::kResource) { + // TODO(bmzhao): Figure out how resource loading works and implement it + return errors::Unimplemented( + "SavedResource loading is not implemented yet"); + } + } + return Status(); +} + +Status ReviveFunctions(const MetaGraphDef& metagraph, + const RevivedObjectMap& revived_objects, + ImmediateExecutionContext* context, + ConcreteFunctionMap* restored_functions) { + const FunctionDefMap function_def_map = + internal::FunctionNameToFunctionDefMap(metagraph.graph_def().library()); + + // Iterate through all objects, only examining functions. + for (const SavedObject& node : metagraph.object_graph_def().nodes()) { + if (node.kind_case() == SavedObject::kBareConcreteFunction) { + const std::string& function_name = + node.bare_concrete_function().concrete_function_name(); + + const SavedConcreteFunction& saved_concrete_function = + metagraph.object_graph_def().concrete_functions().at(function_name); + + const FunctionDef* function_def = function_def_map.at(function_name); + std::unique_ptr concrete_function; + TF_RETURN_IF_ERROR(internal::LoadTFConcreteFunction( + saved_concrete_function, function_def, revived_objects, context, + &concrete_function)); + (*restored_functions)[function_name] = std::move(concrete_function); + } else if (node.kind_case() == SavedObject::kFunction) { + // We only allow loading functions that have an annotated input signature, + // which means there is 1:1 correspondence between tf.function + // <=> SavedFunction <=> SavedConcreteFunction <=> FunctionDef. 
This is + // the same restriction that MLIR has: + // https://github.com/tensorflow/tensorflow/blob/1c064ab76064c58e54261b805027474885a1534d/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc#L2677-L2707 + const SavedFunction& saved_function = node.function(); + if (saved_function.concrete_functions_size() != 1) { + return errors::FailedPrecondition( + "Only tf.functions annotated with an input signature are supported " + "by SavedModelAPI. This means that there should only be a single " + "ConcreteFunction per tf.function"); + } + const std::string& function_name = saved_function.concrete_functions(0); + const SavedConcreteFunction& saved_concrete_function = + metagraph.object_graph_def().concrete_functions().at(function_name); + + const FunctionDef* function_def = function_def_map.at(function_name); + + std::unique_ptr concrete_function; + TF_RETURN_IF_ERROR(internal::LoadTFConcreteFunction( + saved_concrete_function, function_def, revived_objects, context, + &concrete_function)); + (*restored_functions)[function_name] = std::move(concrete_function); + } + } + return Status(); +} + +const TrackableObjectGraph::TrackableObject::SerializedTensor* +FindSerializedTensorInTrackable( + const TrackableObjectGraph::TrackableObject& trackable_object, + absl::string_view name) { + for (const auto& maybe_serialized_tensor : trackable_object.attributes()) { + if (maybe_serialized_tensor.name() == name) { + return &maybe_serialized_tensor; + } + } + return nullptr; +} + +// This function reads the Checkpoint embedded in the SavedModel, and calls the +// appropriate Restore ops on each of the variables. +// Note(bmzhao): Conceptually, objects that contain checkpointable state +// implement the "_gather_saveables_for_checkpoint" method +// https://github.com/tensorflow/tensorflow/blob/ddc1bbad3dfd4a089eb96014f26cc16664b1b2f8/tensorflow/python/training/tracking/base.py#L953-L983 +// which returns a dict of string key -> EITHER: +// 1. python callable (taking a checkpoint key) returning SaveableObject OR +// 2. variable (partitioned/resource/reference or otherwise) +// https://github.com/tensorflow/tensorflow/blob/ddc1bbad3dfd4a089eb96014f26cc16664b1b2f8/tensorflow/python/training/saving/saveable_object.py#L58. +// The string key becomes the "name" attribute of the SerializedTensor proto +// in the TrackableObjectGraph, +// https://github.com/tensorflow/tensorflow/blob/ddc1bbad3dfd4a089eb96014f26cc16664b1b2f8/tensorflow/core/protobuf/trackable_object_graph.proto#L26 +// And the checkpoint_key is a globally unique string derived from this name: +// https://github.com/tensorflow/tensorflow/blob/842df9e6b516e42578a8d23b35d41176b9a6cf1d/tensorflow/python/training/tracking/graph_view.py#L236-L241 +// SaveableObjects model the information needed to pass to the SaveV2/RestoreV2 +// ops via their SaveSpec members +// https://github.com/tensorflow/tensorflow/blob/ddc1bbad3dfd4a089eb96014f26cc16664b1b2f8/tensorflow/python/training/saving/saveable_object.py#L21, +// which contain the "real" checkpoint keys into the TensorBundle SSTable. 
+// They also contain the logic needed to take the restored tensors from +// RestoreV2 and load them back into the "object" they came from via their +// overridden "restore" method: +// https://github.com/tensorflow/tensorflow/blob/ddc1bbad3dfd4a089eb96014f26cc16664b1b2f8/tensorflow/python/training/saving/saveable_object.py#L85 +Status RestoreCheckpoint(SavedModelV2Bundle* bundle, + const RevivedObjectMap& revived_objects, + const std::string& directory, + ImmediateExecutionContext* context) { + // TODO(bmzhao): Batch up all the restores into a single restore op per + // device, following logic in MultiDeviceSaver. + TF_RETURN_IF_ERROR(bundle->VisitObjectsToRestore( + [&revived_objects, &directory, context, bundle]( + int node, const TrackableObjectGraph::TrackableObject& trackable) { + if (bundle->saved_object_graph().nodes(node).kind_case() != + SavedObject::kVariable) { + // TODO(bmzhao): This requires using the newly added Save/Restore + // functions from + // https://github.com/tensorflow/tensorflow/commit/df6b21c13c82b5d0981642cfe18f10e60f78ea5c + LOG(WARNING) << "Restoring non-variable objects has not been " + "implemented yet. (Kind=" + << bundle->saved_object_graph().nodes(node).kind_case() + << ")"; + return Status::OK(); + } + + Variable* variable = + down_cast(revived_objects.at(node).get()); + + // Restore the tensor's value from the checkpoint + const TrackableObjectGraph::TrackableObject::SerializedTensor* + attribute = + FindSerializedTensorInTrackable(trackable, "VARIABLE_VALUE"); + if (attribute == nullptr) { + return errors::FailedPrecondition( + "Could not find SerializedTensor with name VARIABLE_VALUE for " + "saved variable"); + } + + const std::string& checkpoint_key = attribute->checkpoint_key(); + std::string variables_path_prefix = + io::JoinPath(directory, kSavedModelVariablesDirectory, + kSavedModelVariablesFilename); + ImmediateTensorHandlePtr restored_output; + TF_RETURN_IF_ERROR(internal::SingleRestore( + context, variables_path_prefix, checkpoint_key, variable->dtype(), + &restored_output)); + + // Assign the restored tensor's value to the variable + return variable->Assign(restored_output.get()); + })); + + return Status(); +} + +} // namespace + Status TFSavedModelAPI::GetFunction(const std::string& function_path, ConcreteFunction** function) { - // TODO(bmzhao): Add support for retrieving a function. - return errors::Unimplemented( - "Retrieving functions is unimplemented currently"); + const SavedObject* object = + internal::FindNodeAtPath(function_path, bundle_.saved_object_graph()); + if (object == nullptr) { + return errors::NotFound("No saved object found at path ", function_path); + } + + if (object->kind_case() == SavedObject::kBareConcreteFunction) { + *function = + concrete_functions_ + .at(object->bare_concrete_function().concrete_function_name()) + .get(); + } else if (object->kind_case() == SavedObject::kFunction) { + *function = + concrete_functions_.at(object->function().concrete_functions(0)).get(); + } else { + return errors::InvalidArgument(function_path, + " is not a path to a Function."); + } + + return Status(); } Status TFSavedModelAPI::GetSignatureDefFunction( const std::string& signature_def_key, ConcreteFunction** function) { // TODO(bmzhao): Add support for retrieving a signaturedef function. 
return errors::Unimplemented( - "Retrieving functions is unimplemented currently"); + "Retrieving SignatureDef functions is unimplemented currently"); } std::vector TFSavedModelAPI::ListFunctions() { std::vector result; - result.reserve(functions_.size()); - for (ConcreteFunction& function : functions_) { - result.push_back(&function); + result.reserve(concrete_functions_.size()); + for (auto& index_and_function : concrete_functions_) { + result.push_back(index_and_function.second.get()); } return result; } +TFSavedModelAPI::TFSavedModelAPI( + const std::string& directory, SavedModelV2Bundle bundle, + std::unordered_map> + revived_objects, + std::unordered_map> + concrete_functions) + : directory_(directory), + bundle_(std::move(bundle)), + revived_objects_(std::move(revived_objects)), + concrete_functions_(std::move(concrete_functions)) {} + Status TFSavedModelAPI::Load( const std::string& directory, const absl::optional>& tags, ImmediateExecutionContext* context, std::unique_ptr* out) { - // TODO(bmzhao): Add support for loading a TFSavedModelImpl. - return errors::Unimplemented( - "TFSavedModelAPIImpl loading is unimplemented currently"); + // TODO(bmzhao): Add support for loading a TF1 SavedModel. + if (tags) { + return errors::Unimplemented( + "Loading saved models with explicit tags will be supported in the " + "future"); + } + + SavedModelV2Bundle bundle; + TF_RETURN_IF_ERROR(SavedModelV2Bundle::Load(directory, &bundle)); + + // TODO(bmzhao): Mangle loaded function names so that different + // models loaded in the same runtime Context don't clobber eachother. + // This occurs in python here: + // https://github.com/tensorflow/tensorflow/blob/285b5fa15405c5e2c084080f52a1818be8648079/tensorflow/python/saved_model/function_deserialization.py#L438-L454 + + RevivedObjectMap revived_objects; + TF_RETURN_IF_ERROR( + ReviveObjects(bundle.meta_graph_def(), context, &revived_objects)); + + // TODO(bmzhao): When we later add support for loading resources, we need to + // handle the case where materializing a function's captures requires invoking + // other functions. This occurs when retrieving the resource handle for a + // TrackableResource: + // https://github.com/tensorflow/tensorflow/blob/f19c6efb4a8ba60e2492eedc98ef5375abb39dc7/tensorflow/python/saved_model/load.py#L240 + // https://github.com/tensorflow/tensorflow/blob/f19c6efb4a8ba60e2492eedc98ef5375abb39dc7/tensorflow/python/training/tracking/tracking.py#L233 + // This requires restoring functions in a topological sort order by capture + // dependencies. + ConcreteFunctionMap function_map; + TF_RETURN_IF_ERROR(ReviveFunctions(bundle.meta_graph_def(), revived_objects, + context, &function_map)); + + TF_RETURN_IF_ERROR( + RestoreCheckpoint(&bundle, revived_objects, directory, context)); + + out->reset(new TFSavedModelAPI(directory, std::move(bundle), + std::move(revived_objects), + std::move(function_map))); + return Status(); } } // namespace tensorflow diff --git a/tensorflow/c/experimental/saved_model/core/tf_saved_model_api.h b/tensorflow/c/experimental/saved_model/core/tf_saved_model_api.h index cc631a9f3ae..fc8e738e86f 100644 --- a/tensorflow/c/experimental/saved_model/core/tf_saved_model_api.h +++ b/tensorflow/c/experimental/saved_model/core/tf_saved_model_api.h @@ -16,14 +16,19 @@ limitations under the License. 
#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_TF_SAVED_MODEL_IMPL_H_ #define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_TF_SAVED_MODEL_IMPL_H_ +#include #include +#include #include #include #include "absl/types/optional.h" #include "tensorflow/c/eager/immediate_execution_context.h" #include "tensorflow/c/experimental/saved_model/core/concrete_function.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tensorhandle_convertible.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.h" #include "tensorflow/c/experimental/saved_model/core/saved_model_api.h" +#include "tensorflow/cc/saved_model/bundle_v2.h" #include "tensorflow/core/platform/status.h" namespace tensorflow { @@ -63,8 +68,19 @@ class TFSavedModelAPI : public SavedModelAPI { ~TFSavedModelAPI() override = default; private: - TFSavedModelAPI() = default; - std::vector functions_; + TFSavedModelAPI( + const std::string& directory, SavedModelV2Bundle bundle, + std::unordered_map> + revived_objects, + std::unordered_map> + concrete_functions); + + std::string directory_; + SavedModelV2Bundle bundle_; + std::unordered_map> + revived_objects_; + std::unordered_map> + concrete_functions_; }; } // namespace tensorflow diff --git a/tensorflow/c/experimental/saved_model/internal/BUILD b/tensorflow/c/experimental/saved_model/internal/BUILD index b22718dfd04..323298c5fc1 100644 --- a/tensorflow/c/experimental/saved_model/internal/BUILD +++ b/tensorflow/c/experimental/saved_model/internal/BUILD @@ -38,16 +38,17 @@ cc_library( ":concrete_function_type", ":function_metadata", ":function_metadata_type", - ":tensorhandle_list", - ":tensorhandle_list_type", "//tensorflow/c:c_api_macros", "//tensorflow/c:tf_status_internal", + "//tensorflow/c/eager:abstract_tensor_handle", "//tensorflow/c/eager:c_api", "//tensorflow/c/eager:immediate_execution_operation", "//tensorflow/c/eager:tfe_op_internal", + "//tensorflow/c/eager:tfe_tensorhandle_internal", "//tensorflow/c/experimental/saved_model/core:concrete_function", "//tensorflow/c/experimental/saved_model/core:function_metadata", "//tensorflow/core:lib", + "@com_google_absl//absl/types:span", ], ) @@ -164,38 +165,6 @@ cc_library( ], ) -cc_library( - name = "tensorhandle_list", - srcs = [ - "tensorhandle_list.cc", - ], - hdrs = [ - "//tensorflow/c/experimental/saved_model/public:tensorhandle_list.h", - ], - copts = tf_copts(), - visibility = [ - "//tensorflow/c/experimental/saved_model/public:__pkg__", - ], - deps = [ - ":tensorhandle_list_type", - "//tensorflow/c:c_api_macros", - "//tensorflow/c/eager:c_api", - "//tensorflow/c/eager:immediate_execution_tensor_handle", - "//tensorflow/c/eager:tfe_tensorhandle_internal", - ], -) - -cc_library( - name = "tensorhandle_list_type", - hdrs = [ - "tensorhandle_list_type.h", - ], - deps = [ - "//tensorflow/c:conversion_macros", - "//tensorflow/c/eager:immediate_execution_tensor_handle", - ], -) - tf_cc_test( name = "saved_model_api_test", size = "small", @@ -213,7 +182,6 @@ tf_cc_test( "//tensorflow/c/eager:c_api_test_util", "//tensorflow/c/experimental/saved_model/public:concrete_function", "//tensorflow/c/experimental/saved_model/public:saved_model_api", - "//tensorflow/c/experimental/saved_model/public:tensorhandle_list", "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", diff --git a/tensorflow/c/experimental/saved_model/internal/concrete_function.cc b/tensorflow/c/experimental/saved_model/internal/concrete_function.cc index 12d49212a88..65c6eca5623 100644 --- 
a/tensorflow/c/experimental/saved_model/internal/concrete_function.cc +++ b/tensorflow/c/experimental/saved_model/internal/concrete_function.cc @@ -15,13 +15,15 @@ limitations under the License. #include "tensorflow/c/experimental/saved_model/public/concrete_function.h" +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" #include "tensorflow/c/eager/immediate_execution_operation.h" #include "tensorflow/c/eager/tfe_op_internal.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" #include "tensorflow/c/experimental/saved_model/core/concrete_function.h" #include "tensorflow/c/experimental/saved_model/core/function_metadata.h" #include "tensorflow/c/experimental/saved_model/internal/concrete_function_type.h" #include "tensorflow/c/experimental/saved_model/internal/function_metadata_type.h" -#include "tensorflow/c/experimental/saved_model/internal/tensorhandle_list_type.h" #include "tensorflow/c/tf_status_internal.h" #include "tensorflow/core/platform/status.h" @@ -32,15 +34,18 @@ TF_FunctionMetadata* TF_ConcreteFunctionGetMetadata(TF_ConcreteFunction* func) { &tensorflow::unwrap(func)->GetFunctionMetadata())); } -const TF_TensorHandleList* TF_ConcreteFunctionGetCaptures( - TF_ConcreteFunction* func) { - return tensorflow::wrap(&tensorflow::unwrap(func)->GetCaptures()); -} - TFE_Op* TF_ConcreteFunctionGetCallOp(TF_ConcreteFunction* func, + TFE_TensorHandle** inputs, int num_inputs, TF_Status* status) { - tensorflow::ImmediateOpPtr call_op(nullptr); - status->status = tensorflow::unwrap(func)->GetCallOp(&call_op); + tensorflow::ImmediateOpPtr call_op; + absl::Span input_span( + reinterpret_cast( + tensorflow::unwrap(inputs)), + static_cast(num_inputs)); + status->status = tensorflow::unwrap(func)->GetCallOp(input_span, &call_op); + if (!status->status.ok()) { + return nullptr; + } return tensorflow::wrap(call_op.release()); } diff --git a/tensorflow/c/experimental/saved_model/internal/saved_model_api_test.cc b/tensorflow/c/experimental/saved_model/internal/saved_model_api_test.cc index aa0b00ab847..e58b232f9c9 100644 --- a/tensorflow/c/experimental/saved_model/internal/saved_model_api_test.cc +++ b/tensorflow/c/experimental/saved_model/internal/saved_model_api_test.cc @@ -16,10 +16,14 @@ limitations under the License. #include "tensorflow/c/experimental/saved_model/public/saved_model_api.h" #include +#include #include "tensorflow/c/eager/c_api.h" #include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/c_api_test_util.h" +#include "tensorflow/c/experimental/saved_model/public/concrete_function.h" #include "tensorflow/c/tf_status.h" +#include "tensorflow/c/tf_tensor.h" #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/platform/stringpiece.h" #include "tensorflow/core/platform/test.h" @@ -92,12 +96,42 @@ TEST_P(CSavedModelAPITest, LoadsSavedModel) { TF_SavedModel* saved_model = TF_LoadSavedModel(model_dir.c_str(), ctx, status); - // TODO(bmzhao): Change this to expect TF_OK when loading is implemented. - // That unblocks writing other tests that require a TF_SavedModel*, - // like loading a ConcreteFunction. This test at least checks that the - // C API builds and can be minimally run. 
- EXPECT_EQ(TF_GetCode(status), TF_UNIMPLEMENTED); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TF_ConcreteFunction* compute_fn = + TF_GetSavedModelConcreteFunction(saved_model, "compute", status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + std::vector compute_fn_inputs; + TFE_TensorHandle* input_a = TestScalarTensorHandle(ctx, 2.0f); + TFE_TensorHandle* input_b = TestScalarTensorHandle(ctx, 1.0f); + compute_fn_inputs.push_back(input_a); + compute_fn_inputs.push_back(input_b); + + TFE_Op* compute_fn_op = TF_ConcreteFunctionGetCallOp( + compute_fn, compute_fn_inputs.data(), compute_fn_inputs.size(), status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + // TODO(bmzhao): Finish API on FunctionMetadata args, so we know how many + // inputs + outputs a function has. + TFE_TensorHandle* compute_fn_outputs[1] = {nullptr}; + int num_retvals = 1; + + TFE_Execute(compute_fn_op, &compute_fn_outputs[0], &num_retvals, status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + TF_Tensor* result = TFE_TensorHandleResolve(compute_fn_outputs[0], status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + EXPECT_EQ(TF_NumDims(result), 0); + float output_value = *static_cast(TF_TensorData(result)); + // (1 + 2) * (2 + 1) / 3 + 5 should be 8 + EXPECT_FLOAT_EQ(output_value, 8.0); + + TF_DeleteTensor(result); + TFE_DeleteTensorHandle(compute_fn_outputs[0]); + TFE_DeleteTensorHandle(input_a); + TFE_DeleteTensorHandle(input_b); + TFE_DeleteOp(compute_fn_op); TF_DeleteSavedModel(saved_model); TF_DeleteStatus(status); TFE_DeleteContext(ctx); diff --git a/tensorflow/c/experimental/saved_model/public/BUILD b/tensorflow/c/experimental/saved_model/public/BUILD index 0cfa0a2c005..af65e05e7f6 100644 --- a/tensorflow/c/experimental/saved_model/public/BUILD +++ b/tensorflow/c/experimental/saved_model/public/BUILD @@ -24,7 +24,6 @@ exports_files( "concrete_function_list.h", "function_metadata.h", "saved_model_api.h", - "tensorhandle_list.h", ], visibility = ["//tensorflow/c/experimental/saved_model/internal:__pkg__"], ) @@ -40,7 +39,6 @@ cc_library( ":concrete_function_list", ":function_metadata", ":saved_model_api", - ":tensorhandle_list", ], ) @@ -63,8 +61,3 @@ alias( name = "saved_model_api", actual = "//tensorflow/c/experimental/saved_model/internal:saved_model_api", ) - -alias( - name = "tensorhandle_list", - actual = "//tensorflow/c/experimental/saved_model/internal:tensorhandle_list", -) diff --git a/tensorflow/c/experimental/saved_model/public/c_saved_model_api.h b/tensorflow/c/experimental/saved_model/public/c_saved_model_api.h index aae95a5477c..30f533f140a 100644 --- a/tensorflow/c/experimental/saved_model/public/c_saved_model_api.h +++ b/tensorflow/c/experimental/saved_model/public/c_saved_model_api.h @@ -21,7 +21,6 @@ limitations under the License. 
#include "tensorflow/c/experimental/saved_model/public/concrete_function_list.h" #include "tensorflow/c/experimental/saved_model/public/function_metadata.h" #include "tensorflow/c/experimental/saved_model/public/saved_model_api.h" -#include "tensorflow/c/experimental/saved_model/public/tensorhandle_list.h" // IWYU pragma: end_exports #endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_C_SAVED_MODEL_API_H_ diff --git a/tensorflow/c/experimental/saved_model/public/concrete_function.h b/tensorflow/c/experimental/saved_model/public/concrete_function.h index 944ddecea16..ee5292294d6 100644 --- a/tensorflow/c/experimental/saved_model/public/concrete_function.h +++ b/tensorflow/c/experimental/saved_model/public/concrete_function.h @@ -19,7 +19,6 @@ limitations under the License. #include "tensorflow/c/c_api_macros.h" #include "tensorflow/c/eager/c_api.h" #include "tensorflow/c/experimental/saved_model/public/function_metadata.h" -#include "tensorflow/c/experimental/saved_model/public/tensorhandle_list.h" #ifdef __cplusplus extern "C" { @@ -35,13 +34,15 @@ typedef struct TF_ConcreteFunction TF_ConcreteFunction; TF_CAPI_EXPORT extern TF_FunctionMetadata* TF_ConcreteFunctionGetMetadata( TF_ConcreteFunction* func); -// Returns a list of TensorHandles implicitly captured by this function. -TF_CAPI_EXPORT extern const TF_TensorHandleList* TF_ConcreteFunctionGetCaptures( - TF_ConcreteFunction* func); - -// Returns a TFE_Op suitable for executing this function. +// Returns a TFE_Op suitable for executing this function. Caller must provide +// all function inputs in `inputs`, and must not add any additional inputs on +// the returned op. (i.e. don't call TFE_OpAddInput or TFE_OpAddInputList). +// The caller is responsible for deleting the returned TFE_Op. If op +// construction fails, `status` will be non-OK and the returned pointer will be +// null. TF_CAPI_EXPORT extern TFE_Op* TF_ConcreteFunctionGetCallOp( - TF_ConcreteFunction* func, TF_Status* status); + TF_ConcreteFunction* func, TFE_TensorHandle** inputs, int num_inputs, + TF_Status* status); #ifdef __cplusplus } // end extern "C" diff --git a/tensorflow/c/experimental/saved_model/public/tensorhandle_list.h b/tensorflow/c/experimental/saved_model/public/tensorhandle_list.h deleted file mode 100644 index a1e88db3474..00000000000 --- a/tensorflow/c/experimental/saved_model/public/tensorhandle_list.h +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_TENSORHANDLE_LIST_H_ -#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_TENSORHANDLE_LIST_H_ - -#include - -#include "tensorflow/c/c_api_macros.h" -#include "tensorflow/c/eager/c_api.h" - -#ifdef __cplusplus -extern "C" { -#endif // __cplusplus - -// An opaque type that is acts like a list of TF_ConcreteFunction pointers. 
-typedef struct TF_TensorHandleList TF_TensorHandleList; - -// Returns the size of `list`. -TF_CAPI_EXPORT extern size_t TF_TensorHandleListSize( - const TF_TensorHandleList* list); - -// Returns the `i`th TFE_TensorHandle in the list. -TF_CAPI_EXPORT extern TFE_TensorHandle* TF_TensorHandleListGet( - const TF_TensorHandleList* list, int i); - -#ifdef __cplusplus -} // end extern "C" -#endif // __cplusplus - -#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_TENSORHANDLE_LIST_H_ diff --git a/tensorflow/c/kernels.cc b/tensorflow/c/kernels.cc index 3021a38e888..20a6c5117cf 100644 --- a/tensorflow/c/kernels.cc +++ b/tensorflow/c/kernels.cc @@ -97,6 +97,11 @@ void TF_KernelBuilder_HostMemory(TF_KernelBuilder* kernel_builder, kernel_builder->cc_builder->HostMemory(arg_name); } +void TF_KernelBuilder_Priority(TF_KernelBuilder* kernel_builder, + int32_t priority_number) { + kernel_builder->cc_builder->Priority(priority_number); +} + namespace tensorflow { namespace { @@ -234,6 +239,14 @@ void TF_OpKernelContext_Failure(TF_OpKernelContext* ctx, TF_Status* status) { DEFINE_TF_GETATTR(Type, TF_DataType, tensorflow::DataType) DEFINE_TF_GETATTR(Int32, tensorflow::int32, int32_t) +TF_StringView TF_OpKernelConstruction_GetName(TF_OpKernelConstruction* ctx) { + auto* cc_ctx = reinterpret_cast(ctx); + TF_StringView string_view_of_name; + string_view_of_name.data = cc_ctx->def().name().data(); + string_view_of_name.len = cc_ctx->def().name().length(); + return string_view_of_name; +} + TF_DataType TF_ExpectedOutputDataType(TF_OpKernelContext* ctx, int i) { auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(ctx); return static_cast(cc_ctx->expected_output_dtype(i)); @@ -266,4 +279,4 @@ TF_Tensor* TF_AllocateOutput(TF_OpKernelContext* context, int index, return nullptr; } return tf_tensor; -} +} \ No newline at end of file diff --git a/tensorflow/c/kernels.h b/tensorflow/c/kernels.h index 084717c1d9e..c7138a39c73 100644 --- a/tensorflow/c/kernels.h +++ b/tensorflow/c/kernels.h @@ -18,6 +18,7 @@ limitations under the License. #include +#include "tensorflow/c/c_api.h" #include "tensorflow/c/tf_datatype.h" #include "tensorflow/c/tf_status.h" @@ -107,6 +108,10 @@ TF_CAPI_EXPORT extern void TF_KernelBuilder_TypeConstraint( TF_CAPI_EXPORT extern void TF_KernelBuilder_HostMemory( TF_KernelBuilder* kernel_builder, const char* arg_name); +// Specify a priority number for this kernel. +TF_CAPI_EXPORT extern void TF_KernelBuilder_Priority( + TF_KernelBuilder* kernel_builder, int32_t priority_number); + // Register the given kernel builder with the TensorFlow runtime. If // registration fails, the given status will be populated. // @@ -180,6 +185,10 @@ TF_CAPI_EXPORT extern void TF_OpKernelConstruction_GetAttrInt32( TF_OpKernelConstruction* ctx, const char* attr_name, int32_t* val, TF_Status* status); +// Returns the unique operation name for this OpKernel. +TF_CAPI_EXPORT extern TF_StringView TF_OpKernelConstruction_GetName( + TF_OpKernelConstruction* ctx); + // Allocates Tensor for output at given index. Caller takes ownership of // returned TF_Tensor and should deallocate it using TF_DeleteTensor(tensor). 
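The two kernels.h additions above are most naturally used on a kernel's registration path: the priority influences which kernel is selected when several registered kernels match, and the name accessor lets the create function see which NodeDef it is being instantiated for. A minimal sketch with illustrative op, device, and priority values:

// Sketch: a C-API kernel that reads its NodeDef name at construction time and
// registers with an explicit priority. "FooOp", "FakeDevice", and the priority
// value are illustrative.
void* MyCreate(TF_OpKernelConstruction* ctx) {
  TF_StringView name = TF_OpKernelConstruction_GetName(ctx);
  std::string node_name(name.data, name.len);  // e.g. "SomeNodeName"
  return nullptr;
}
void MyCompute(void* kernel, TF_OpKernelContext* ctx) {}

void RegisterFooKernel() {
  TF_KernelBuilder* builder = TF_NewKernelBuilder(
      "FooOp", "FakeDevice", &MyCreate, &MyCompute, /*delete_func=*/nullptr);
  TF_KernelBuilder_Priority(builder, 1);  // relative priority among matching kernels
  TF_Status* status = TF_NewStatus();
  TF_RegisterKernelBuilder("FooOpKernel", builder, status);
  TF_DeleteStatus(status);
}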
// diff --git a/tensorflow/c/kernels/BUILD b/tensorflow/c/kernels/BUILD index 770352c62c1..008d2ee2d67 100644 --- a/tensorflow/c/kernels/BUILD +++ b/tensorflow/c/kernels/BUILD @@ -24,6 +24,21 @@ tf_kernel_library( ], ) +tf_kernel_library( + name = "summary_op", + prefix = "summary_op", + deps = [ + "//tensorflow/c:kernels", + "//tensorflow/c:tf_status", + "//tensorflow/c:tf_tensor", + "//tensorflow/c/kernels:tensor_shape_utils", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//third_party/eigen3", + ], +) + tf_gen_op_libs( op_lib_names = ["bitcast"], deps = [ @@ -35,6 +50,15 @@ tf_gen_op_libs( ], ) +tf_gen_op_libs( + op_lib_names = ["summary"], + deps = [ + "//tensorflow/c:ops", + "//tensorflow/c:tf_status", + "//tensorflow/core:lib", + ], +) + tf_cc_test( name = "bitcast_op_test", srcs = ["bitcast_op_test.cc"], @@ -48,6 +72,62 @@ tf_cc_test( ], ) +tf_cc_test( + name = "summary_op_test", + srcs = ["summary_op_test.cc"], + deps = [ + ":summary_op", + "//tensorflow/c:kernels", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + +tf_cc_test( + name = "summary_op_benchmark_test", + size = "small", + srcs = ["summary_op_benchmark_test.cc"], + deps = [ + ":summary_op", + "//tensorflow/c:kernels", + "//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + +cc_library( + name = "tensor_shape_utils", + srcs = ["tensor_shape_utils.cc"], + hdrs = ["tensor_shape_utils.h"], + visibility = ["//visibility:private"], + deps = [ + "//tensorflow/c:tf_tensor", + "//tensorflow/core:lib", + ], +) + +tf_cc_test( + name = "tensor_shape_utils_test", + srcs = ["tensor_shape_utils_test.cc"], + deps = [ + ":tensor_shape_utils", + "//tensorflow/c:tf_tensor_internal", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + # Changes to the Android srcs here should be replicated in # tensorflow/contrib/makefile/tf_op_files.txt. # @@ -59,11 +139,17 @@ filegroup( name = "android_all_op_kernels", srcs = [ "bitcast_op.cc", + "summary_op.cc", + "tensor_shape_utils.cc", + "tensor_shape_utils.h", ], ) # LINT.ThenChange(//tensorflow/contrib/makefile/tf_op_files.txt) filegroup( name = "android_all_ops", - srcs = ["ops/bitcast.cc"], + srcs = [ + "ops/bitcast.cc", + "ops/summary.cc", + ], ) diff --git a/tensorflow/c/kernels/ops/bitcast.cc b/tensorflow/c/kernels/ops/bitcast.cc index 3ba56411c38..0bc9fe86f10 100644 --- a/tensorflow/c/kernels/ops/bitcast.cc +++ b/tensorflow/c/kernels/ops/bitcast.cc @@ -22,8 +22,19 @@ limitations under the License. 
#include "tensorflow/core/platform/macros.h" static void ComputeNewShape(TF_ShapeInferenceContext* ctx, - TF_ShapeHandle* shape, size_t input_type_size, - size_t output_type_size, TF_Status* status) { + TF_ShapeHandle* shape, TF_DataType input_type, + TF_DataType output_type, TF_Status* status) { + size_t input_type_size = TF_DataTypeSize(input_type); + size_t output_type_size = TF_DataTypeSize(output_type); + + if (input_type_size == 0 || output_type_size == 0) { + std::ostringstream err; + err << "Cannot bitcast type " << input_type << " to " << output_type + << " because one of the type sizes is zero"; + TF_SetStatus(status, TF_INVALID_ARGUMENT, err.str().c_str()); + return; + } + TF_SetStatus(status, TF_OK, ""); if (input_type_size < output_type_size) { TF_ShapeInferenceContextWithRankAtLeast(ctx, shape, 1, shape, status); @@ -37,9 +48,9 @@ static void ComputeNewShape(TF_ShapeInferenceContext* ctx, TF_ShapeInferenceContextSubshape(ctx, shape, 0, -1, shape, status); } else { std::ostringstream err; - err << "Cannot bitcast due to shape. " - << TF_DimensionHandleValue(last_dim) << " does not match " - << divisor_val; + err << "Cannot bitcast from " << input_type << " to " << output_type + << " due to shape. " << TF_DimensionHandleValue(last_dim) + << " does not match " << divisor_val; TF_SetStatus(status, TF_INVALID_ARGUMENT, err.str().c_str()); } TF_DeleteDimensionHandle(last_dim); @@ -78,23 +89,8 @@ static void bitcast_shape_inference_fn(TF_ShapeInferenceContext* ctx, TF_ShapeInferenceContext_GetAttrType(ctx, "type", &output_type, status); } - size_t input_type_size; - size_t output_type_size; - if (TF_GetCode(status) == TF_OK) { - input_type_size = TF_DataTypeSize(input_type); - output_type_size = TF_DataTypeSize(output_type); - - if (input_type_size == 0 || output_type_size == 0) { - std::ostringstream err; - err << "Cannot bitcast type " << input_type << " to " << output_type - << " because one of the type sizes is zero"; - TF_SetStatus(status, TF_INVALID_ARGUMENT, err.str().c_str()); - } - } - - if (TF_GetCode(status) == TF_OK) { - ComputeNewShape(ctx, result, input_type_size, output_type_size, status); + ComputeNewShape(ctx, result, input_type, output_type, status); } if (TF_GetCode(status) == TF_OK) { diff --git a/tensorflow/c/kernels/ops/summary.cc b/tensorflow/c/kernels/ops/summary.cc new file mode 100644 index 00000000000..b6b37f6b5b4 --- /dev/null +++ b/tensorflow/c/kernels/ops/summary.cc @@ -0,0 +1,53 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/c/ops.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/core/framework/selective_registration.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" + +static void scalar_summary_shape_inference_fn(TF_ShapeInferenceContext* ctx, + TF_Status* status) { + TF_SetStatus(status, TF_OK, ""); + TF_ShapeHandle* result = TF_ShapeInferenceContextScalar(ctx); + TF_ShapeInferenceContextSetOutput(ctx, 0, result, status); + TF_DeleteShapeHandle(result); +} + +void Register_ScalarSummaryOp() { + TF_Status* status = TF_NewStatus(); + + TF_OpDefinitionBuilder* op_builder = + TF_NewOpDefinitionBuilder("ScalarSummary"); + TF_OpDefinitionBuilderAddInput(op_builder, "tags: string"); + TF_OpDefinitionBuilderAddInput(op_builder, "values: T"); + TF_OpDefinitionBuilderAddOutput(op_builder, "summary: string"); + TF_OpDefinitionBuilderAddAttr(op_builder, "T: realnumbertype"); + TF_OpDefinitionBuilderSetShapeInferenceFunction( + op_builder, &scalar_summary_shape_inference_fn); + + TF_RegisterOpDefinition(op_builder, status); + CHECK_EQ(TF_GetCode(status), TF_OK) + << "ScalarSummary op registration failed: " << TF_Message(status); + TF_DeleteStatus(status); +} + +TF_ATTRIBUTE_UNUSED static bool SummaryScalarOpRegistered = []() { + if (SHOULD_REGISTER_OP("ScalarSummary")) { + Register_ScalarSummaryOp(); + } + return true; +}(); diff --git a/tensorflow/c/kernels/summary_op.cc b/tensorflow/c/kernels/summary_op.cc new file mode 100644 index 00000000000..bd528da4165 --- /dev/null +++ b/tensorflow/c/kernels/summary_op.cc @@ -0,0 +1,172 @@ + +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/c/kernels.h" +#include "tensorflow/c/kernels/tensor_shape_utils.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/c/tf_tensor.h" +#include "tensorflow/core/framework/selective_registration.h" +#include "tensorflow/core/framework/summary.pb.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/bfloat16/bfloat16.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/strcat.h" +#include "tensorflow/core/platform/tstring.h" +#include "tensorflow/core/platform/types.h" + +namespace { + +// Struct that stores the status and TF_Tensor inputs to the opkernel. +// Used to delete tensor and status in its destructor upon kernel return. 
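For context on how the kernel defined in this file is reached, a graph node that feeds it is wired the same way the benchmark later in this change does it. A minimal sketch, assuming a tensorflow::Graph* `g` and same-shaped DT_STRING `tags` and DT_FLOAT `values` tensors:

// Sketch: constructing a ScalarSummary node (mirrors summary_op_benchmark_test.cc).
Node* summary_node = nullptr;
TF_CHECK_OK(NodeBuilder(g->NewName("scalar_summary"), "ScalarSummary")
                .Input(test::graph::Constant(g, tags))
                .Input(test::graph::Constant(g, values))
                .Attr("T", DT_FLOAT)
                .Finalize(g, &summary_node));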
+struct Params { + TF_Tensor* tags; + TF_Tensor* values; + TF_Status* status; + explicit Params(TF_OpKernelContext* ctx) + : tags(nullptr), values(nullptr), status(nullptr) { + status = TF_NewStatus(); + TF_GetInput(ctx, 0, &tags, status); + if (TF_GetCode(status) == TF_OK) { + TF_GetInput(ctx, 1, &values, status); + } + } + ~Params() { + TF_DeleteStatus(status); + TF_DeleteTensor(tags); + TF_DeleteTensor(values); + } +}; + +// dummy functions used for kernel registration +void* ScalarSummaryOp_Create(TF_OpKernelConstruction* ctx) { return nullptr; } + +void ScalarSummaryOp_Delete(void* kernel) {} + +// Helper functions for compute method +bool IsSameSize(TF_Tensor* tensor1, TF_Tensor* tensor2); +// Returns a string representation of a single tag or empty string if there +// are multiple tags +std::string SingleTag(TF_Tensor* tags); + +template +void ScalarSummaryOp_Compute(void* kernel, TF_OpKernelContext* ctx) { + Params params(ctx); + if (TF_GetCode(params.status) != TF_OK) { + TF_OpKernelContext_Failure(ctx, params.status); + return; + } + if (!IsSameSize(params.tags, params.values)) { + std::ostringstream err; + err << "tags and values are not the same shape: " + << tensorflow::ShapeDebugString(params.tags) + << " != " << tensorflow::ShapeDebugString(params.values) + << SingleTag(params.tags); + TF_SetStatus(params.status, TF_INVALID_ARGUMENT, err.str().c_str()); + TF_OpKernelContext_Failure(ctx, params.status); + return; + } + // Convert tags and values tensor to array to access elements by index + tensorflow::Summary s; + auto tags_array = + static_cast(TF_TensorData(params.tags)); + auto values_array = static_cast(TF_TensorData(params.values)); + // Copy tags and values into summary protobuf + for (int i = 0; i < TF_TensorElementCount(params.tags); ++i) { + tensorflow::Summary::Value* v = s.add_value(); + const tensorflow::tstring& Ttags_i = tags_array[i]; + v->set_tag(Ttags_i.data(), Ttags_i.size()); + v->set_simple_value(static_cast(values_array[i])); + } + TF_Tensor* summary_tensor = + TF_AllocateOutput(ctx, 0, TF_ExpectedOutputDataType(ctx, 0), nullptr, 0, + sizeof(tensorflow::tstring), params.status); + if (TF_GetCode(params.status) != TF_OK) { + TF_DeleteTensor(summary_tensor); + TF_OpKernelContext_Failure(ctx, params.status); + return; + } + tensorflow::tstring* output_tstring = + reinterpret_cast(TF_TensorData(summary_tensor)); + CHECK(SerializeToTString(s, output_tstring)); + TF_DeleteTensor(summary_tensor); +} + +bool IsSameSize(TF_Tensor* tensor1, TF_Tensor* tensor2) { + if (TF_NumDims(tensor1) != TF_NumDims(tensor2)) { + return false; + } + for (int d = 0; d < TF_NumDims(tensor1); d++) { + if (TF_Dim(tensor1, d) != TF_Dim(tensor2, d)) { + return false; + } + } + return true; +} + +std::string SingleTag(TF_Tensor* tags) { + if (TF_TensorElementCount(tags) == 1) { + const char* single_tag = + static_cast(TF_TensorData(tags))->c_str(); + return tensorflow::strings::StrCat(" (tag '", single_tag, "')"); + } else { + return ""; + } +} + +template +void RegisterScalarSummaryOpKernel() { + TF_Status* status = TF_NewStatus(); + { + auto* builder = TF_NewKernelBuilder( + "ScalarSummary", tensorflow::DEVICE_CPU, &ScalarSummaryOp_Create, + &ScalarSummaryOp_Compute, &ScalarSummaryOp_Delete); + TF_KernelBuilder_TypeConstraint( + builder, "T", + static_cast(tensorflow::DataTypeToEnum::v()), status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << "Error while adding type constraint"; + TF_RegisterKernelBuilder("ScalarSummary", builder, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) + << 
"Error while registering Scalar Summmary kernel"; + } + TF_DeleteStatus(status); +} + +// A dummy static variable initialized by a lambda whose side-effect is to +// register the ScalarSummary kernel. +TF_ATTRIBUTE_UNUSED bool IsScalarSummaryOpKernelRegistered = []() { + if (SHOULD_REGISTER_OP_KERNEL("ScalarSummary")) { + RegisterScalarSummaryOpKernel(); + RegisterScalarSummaryOpKernel(); + RegisterScalarSummaryOpKernel(); + RegisterScalarSummaryOpKernel(); + RegisterScalarSummaryOpKernel(); + RegisterScalarSummaryOpKernel(); + RegisterScalarSummaryOpKernel(); + RegisterScalarSummaryOpKernel(); + RegisterScalarSummaryOpKernel(); + RegisterScalarSummaryOpKernel(); + RegisterScalarSummaryOpKernel(); + RegisterScalarSummaryOpKernel(); + } + return true; +}(); +} // namespace diff --git a/tensorflow/c/kernels/summary_op_benchmark_test.cc b/tensorflow/c/kernels/summary_op_benchmark_test.cc index 9a68d5ddec1..7c1ab1f7103 100644 --- a/tensorflow/c/kernels/summary_op_benchmark_test.cc +++ b/tensorflow/c/kernels/summary_op_benchmark_test.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + #include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/graph/node_builder.h" @@ -20,19 +22,20 @@ limitations under the License. #include "tensorflow/core/platform/test_benchmark.h" #include "tensorflow/core/framework/tensor_shape.h" -namespace tensorflow { +namespace tensorflow { +namespace { -static Graph* BM_ScalarSummaryOp(TensorShape shape, const char* tag, - float value) { +Graph* BM_ScalarSummaryOp(TensorShape shape, std::string tag, + float value) { Graph* g = new Graph(OpRegistry::Global()); Tensor tags(DT_STRING, shape); Tensor values(DT_FLOAT, shape); for (int i = 0; i < tags.NumElements(); ++i){ - tags.flat()(i) = tag; - values.flat()(i) = value; + tags.flat()(i) = tag; + values.flat()(i) = value; } Node* ret; - TF_CHECK_OK(NodeBuilder(g->NewName("dummy"), "SummaryScalar") + TF_CHECK_OK(NodeBuilder(g->NewName("dummy"), "ScalarSummary") .Input(test::graph::Constant(g, tags)) .Input(test::graph::Constant(g, values)) .Attr("T", DT_FLOAT) @@ -42,23 +45,27 @@ static Graph* BM_ScalarSummaryOp(TensorShape shape, const char* tag, // Macro used to parse initializer list for tensorshape #define DIMARGS(...) 
{__VA_ARGS__} -// Random parameters for testing -constexpr char longTagParam = "LONGTAG____________________________"; +// // Random parameters for testing +constexpr char longTagParam[] = "LONGTAG____________________________"; constexpr float largeValueParam = 2352352.2623433; -#define BM_ScalarSummaryDev(device, dims, name, tag, value) \ - static void BM_ScalarSummary_##name##_##device(int iters) { \ - TensorShape tensorshape(DIMARGS(dims)); \ - test::Benchmark(#device, BM_ScalarSummaryOp( \ - tensorshape, #tag, value)).Run(iters); \ - } \ - BENCHMARK(BM_ScalarSummary_##name##_##device); +#define BM_ScalarSummaryDev(device, dims, name, tag, value) \ + void BM_ScalarSummary##name##device(int iters) { \ + testing::StopTiming(); \ + TensorShape tensorshape(DIMARGS dims); \ + auto g = BM_ScalarSummaryOp(tensorshape, #tag, value); \ + testing::StartTiming(); \ + test::Benchmark("cpu", g).Run(iters); \ + } \ + BENCHMARK(BM_ScalarSummary##name##device); -BM_ScalarSummaryDev(cpu, (5, 10, 100), Base, tag, 5.2); +BM_ScalarSummaryDev(Cpu, (5, 10, 100), Base, Tag, 5.2); // Benchmark for large shapes -BM_ScalarSummaryDev(cpu, (500, 1000, 10000), Large_Shape, tag, 5.2); +BM_ScalarSummaryDev(Cpu, (500, 100, 100), LargeShape, Tag, 5.2); // Benchmark for large tag tstring -BM_ScalarSummaryDev(cpu, (5, 10, 100), Long_Tag, longTagParam, 5.2); +BM_ScalarSummaryDev(Cpu, (5, 10, 100), LongTag, longTagParam, 5.2); // Benchmark for large values -BM_ScalarSummaryDev(cpu, (500, 1000, 10000), Large_Value, tag, largeValueParam); -} // namespace tensorflow \ No newline at end of file +BM_ScalarSummaryDev(Cpu, (500, 100, 100), LargeValue, Tag, largeValueParam); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/c/kernels/summary_op_test.cc b/tensorflow/c/kernels/summary_op_test.cc new file mode 100644 index 00000000000..68c8deb5eab --- /dev/null +++ b/tensorflow/c/kernels/summary_op_test.cc @@ -0,0 +1,186 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/c/kernels.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/attr_value_util.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/summary.pb.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/strcat.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/protobuf/error_codes.pb.h" + +namespace tensorflow { +namespace { + +class DummyDevice : public DeviceBase { + public: + explicit DummyDevice(Env* env) : DeviceBase(env) {} + Allocator* GetAllocator(AllocatorAttributes /*attr*/) override { + return cpu_allocator(); + } +}; + +// Helper for comparing ouput and expected output +void ExpectSummaryMatches(const Summary& actual, const string& expected_str) { + Summary expected; + ASSERT_TRUE(protobuf::TextFormat::ParseFromString(expected_str, &expected)); + EXPECT_EQ(expected.DebugString(), actual.DebugString()); +} + +void TestScalarSummaryOp(Tensor* tags, Tensor* values, string expected_output, + error::Code expected_code) { + // Initialize node used to fetch OpKernel + Status status; + NodeDef def; + def.set_op("ScalarSummary"); + + def.set_device(DEVICE_CPU); + + AttrValue valuesTypeAttr; + SetAttrValue(values->dtype(), &valuesTypeAttr); + (*def.mutable_attr())["T"] = valuesTypeAttr; + + def.add_input(strings::StrCat("input1: ", DataTypeString(tags->dtype()))); + def.add_input(strings::StrCat("input2: ", DataTypeString(values->dtype()))); + + std::unique_ptr kernel = + CreateOpKernel(DeviceType(DEVICE_CPU), nullptr, nullptr, def, 1, &status); + ASSERT_TRUE(status.ok()) << status.ToString(); + OpKernelContext::Params params; + DummyDevice dummy_device(nullptr); + params.device = &dummy_device; + params.op_kernel = kernel.get(); + AllocatorAttributes alloc_attrs; + params.output_attr_array = &alloc_attrs; + gtl::InlinedVector inputs; + inputs.emplace_back(tags); + inputs.emplace_back(values); + params.inputs = &inputs; + OpKernelContext ctx(¶ms, 1); + kernel->Compute(&ctx); + ASSERT_EQ(expected_code, ctx.status().code()); + if (expected_code == error::OK) { + Summary summary; + ASSERT_TRUE(ParseProtoUnlimited( + &summary, ctx.mutable_output(0)->scalar()())); + ExpectSummaryMatches(summary, expected_output); + } else { + EXPECT_TRUE(absl::StrContains(ctx.status().ToString(), expected_output)) + << ctx.status(); + } +} + +TEST(ScalarSummaryOpTest, SimpleFloat) { + int vectorSize = 3; + Tensor tags(DT_STRING, {vectorSize}); + Tensor values(DT_FLOAT, {vectorSize}); + tags.vec()(0) = "tag1"; + tags.vec()(1) = "tag2"; + tags.vec()(2) = "tag3"; + values.vec()(0) = 1.0f; + values.vec()(1) = -0.73f; + values.vec()(2) = 10000.0f; + TestScalarSummaryOp(&tags, &values, R"( + value { tag: 'tag1' simple_value: 1.0 } + value { tag: 'tag2' simple_value: -0.73} + value { tag: 'tag3' simple_value: 10000.0})", + error::OK); +} + +TEST(ScalarSummaryOpTest, SimpleDouble) { + int vectorSize = 3; + 
Tensor tags(DT_STRING, {vectorSize}); + Tensor values(DT_DOUBLE, {vectorSize}); + tags.vec()(0) = "tag1"; + tags.vec()(1) = "tag2"; + tags.vec()(2) = "tag3"; + values.vec()(0) = 1.0; + values.vec()(1) = -0.73; + values.vec()(2) = 10000.0; + TestScalarSummaryOp(&tags, &values, R"( + value { tag: 'tag1' simple_value: 1.0 } + value { tag: 'tag2' simple_value: -0.73} + value { tag: 'tag3' simple_value: 10000.0})", + error::OK); +} + +TEST(ScalarSummaryOpTest, SimpleHalf) { + int vectorSize = 3; + Tensor tags(DT_STRING, {vectorSize}); + Tensor values(DT_HALF, {vectorSize}); + tags.vec()(0) = "tag1"; + tags.vec()(1) = "tag2"; + tags.vec()(2) = "tag3"; + values.vec()(0) = Eigen::half(1.0); + values.vec()(1) = Eigen::half(-2.0); + values.vec()(2) = Eigen::half(10000.0); + TestScalarSummaryOp(&tags, &values, R"( + value { tag: 'tag1' simple_value: 1.0 } + value { tag: 'tag2' simple_value: -2.0} + value { tag: 'tag3' simple_value: 10000.0})", + error::OK); +} + +TEST(ScalarSummaryOpTest, Error_WrongDimsTags) { + Tensor tags(DT_STRING, {2, 1}); + Tensor values(DT_FLOAT, {2}); + tags.matrix()(0, 0) = "tag1"; + tags.matrix()(1, 0) = "tag2"; + values.vec()(0) = 1.0f; + values.vec()(1) = -2.0f; + TestScalarSummaryOp(&tags, &values, "tags and values are not the same shape", + error::INVALID_ARGUMENT); +} + +TEST(ScalarSummaryOpTest, Error_WrongValuesTags) { + Tensor tags(DT_STRING, {2}); + Tensor values(DT_FLOAT, {2, 1}); + tags.vec()(0) = "tag1"; + tags.vec()(1) = "tag2"; + values.matrix()(0, 0) = 1.0f; + values.matrix()(1, 0) = -2.0f; + TestScalarSummaryOp(&tags, &values, "tags and values are not the same shape", + error::INVALID_ARGUMENT); +} + +TEST(ScalarSummaryOpTest, Error_WrongWithSingleTag) { + Tensor tags(DT_STRING, {1}); + Tensor values(DT_FLOAT, {2, 1}); + tags.vec()(0) = "tag1"; + values.matrix()(0, 0) = 1.0f; + values.matrix()(1, 0) = -2.0f; + TestScalarSummaryOp(&tags, &values, "tags and values are not the same shape", + error::INVALID_ARGUMENT); +} + +TEST(ScalarSummaryOpTest, IsRegistered) { + const OpRegistrationData* reg; + TF_CHECK_OK(OpRegistry::Global()->LookUp("ScalarSummary", ®)); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/c/kernels/tensor_shape_utils.cc b/tensorflow/c/kernels/tensor_shape_utils.cc new file mode 100644 index 00000000000..967330ccb93 --- /dev/null +++ b/tensorflow/c/kernels/tensor_shape_utils.cc @@ -0,0 +1,40 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/kernels/tensor_shape_utils.h" + +#include + +#include "tensorflow/c/tf_tensor.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/strcat.h" + +namespace tensorflow { + +std::string ShapeDebugString(TF_Tensor* tensor) { + // A TF_Tensor cannot have an unknown rank. 
+ CHECK_GE(TF_NumDims(tensor), 0); + tensorflow::string s = "["; + for (int i = 0; i < TF_NumDims(tensor); ++i) { + if (i > 0) tensorflow::strings::StrAppend(&s, ","); + int64_t dim = TF_Dim(tensor, i); + // A TF_Tensor cannot have an unknown dimension. + CHECK_GE(dim, 0); + tensorflow::strings::StrAppend(&s, dim); + } + tensorflow::strings::StrAppend(&s, "]"); + return s; +} +} // namespace tensorflow diff --git a/tensorflow/c/experimental/saved_model/internal/tensorhandle_list_type.h b/tensorflow/c/kernels/tensor_shape_utils.h similarity index 51% rename from tensorflow/c/experimental/saved_model/internal/tensorhandle_list_type.h rename to tensorflow/c/kernels/tensor_shape_utils.h index 566417df025..bfe51bc1a2a 100644 --- a/tensorflow/c/experimental/saved_model/internal/tensorhandle_list_type.h +++ b/tensorflow/c/kernels/tensor_shape_utils.h @@ -13,25 +13,25 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_CONCRETE_FUNCTION_LIST_TYPE_H_ -#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_CONCRETE_FUNCTION_LIST_TYPE_H_ +// This file contains shape utilities to be used by kernels and is not part of +// the C API. As such, it is subject to change at any time. -#include +#ifndef TENSORFLOW_C_TENSOR_SHAPE_UTILS_H_ +#define TENSORFLOW_C_TENSOR_SHAPE_UTILS_H_ -#include "tensorflow/c/conversion_macros.h" -#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include -// Internal structures used by the SavedModel C API. These are likely to -// change and should not be depended on. - -typedef struct TF_TensorHandleList TF_TensorHandleList; +#include "tensorflow/c/tf_tensor.h" namespace tensorflow { -DEFINE_CONVERSION_FUNCTIONS( - std::vector, - TF_TensorHandleList) +// The following are utils for the shape of a TF_Tensor type. +// These functions may later be subsumed by the methods for a +// TF_TensorShape type. + +// Returns a string representation of the TF_Tensor shape. +std::string ShapeDebugString(TF_Tensor* tensor); } // namespace tensorflow -#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_CONCRETE_FUNCTION_LIST_TYPE_H_ +#endif // TENSORFLOW_C_TENSOR_SHAPE_UTILS_H_ diff --git a/tensorflow/c/kernels/tensor_shape_utils_test.cc b/tensorflow/c/kernels/tensor_shape_utils_test.cc new file mode 100644 index 00000000000..783105f3ad7 --- /dev/null +++ b/tensorflow/c/kernels/tensor_shape_utils_test.cc @@ -0,0 +1,51 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/c/kernels/tensor_shape_utils.h" + +#include "tensorflow/c/tf_tensor_internal.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +namespace { + +// A wrapper that will automatically delete the allocated TF_Tensor +// once out of scope. +struct TF_TensorWrapper { + TF_Tensor* tf_tensor; + explicit TF_TensorWrapper(TF_Tensor* tensor) { tf_tensor = tensor; } + ~TF_TensorWrapper() { TF_DeleteTensor(tf_tensor); } +}; + +void TestShapeMatch(TensorShape shape) { + Tensor tensor(DT_FLOAT, shape); + Status status; + TF_Tensor* tf_tensor = TF_TensorFromTensor(tensor, &status); + TF_TensorWrapper tensor_wrapper = TF_TensorWrapper(tf_tensor); + ASSERT_TRUE(status.ok()) << status.ToString(); + ASSERT_EQ(tensor.shape().DebugString(), ShapeDebugString(tf_tensor)); +} + +TEST(ShapeDebugString, RegularShape) { TestShapeMatch(TensorShape({5, 4, 7})); } + +TEST(ShapeDebugString, ScalarShape) { TestShapeMatch(TensorShape({})); } + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/c/kernels_test.cc b/tensorflow/c/kernels_test.cc index 423302741de..3c8ac934428 100644 --- a/tensorflow/c/kernels_test.cc +++ b/tensorflow/c/kernels_test.cc @@ -73,6 +73,12 @@ static void* MyCreateFunc(TF_OpKernelConstruction* ctx) { EXPECT_EQ(TF_FLOAT, type); TF_DeleteStatus(status); + // Exercise kernel NodeDef name read + TF_StringView name_string_view = TF_OpKernelConstruction_GetName(ctx); + std::string node_name = "SomeNodeName"; + std::string candidate_node_name = + std::string(name_string_view.data, name_string_view.len); + EXPECT_EQ(node_name, candidate_node_name); return s; } @@ -96,9 +102,11 @@ namespace tensorflow { static std::unique_ptr GetFakeKernel(const char* device_name, const char* op_name, + const char* node_name, Status* status) { NodeDef def; def.set_op(op_name); + def.set_name(node_name); def.set_device(device_name); def.add_input("input1"); def.add_input("input2"); @@ -114,7 +122,7 @@ static std::unique_ptr GetFakeKernel(const char* device_name, // Tests registration of a single C kernel and checks that calls through the // C/C++ boundary are being made. 
TEST(TestKernel, TestRegisterKernelBuilder) { - const char* kernel_name = "SomeKernelName"; + const char* node_name = "SomeNodeName"; const char* op_name = "FooOp"; const char* device_name = "FakeDeviceName1"; @@ -129,7 +137,7 @@ TEST(TestKernel, TestRegisterKernelBuilder) { { TF_Status* status = TF_NewStatus(); - TF_RegisterKernelBuilder(kernel_name, builder, status); + TF_RegisterKernelBuilder(node_name, builder, status); EXPECT_EQ(TF_OK, TF_GetCode(status)); TF_Buffer* buf = TF_GetRegisteredKernelsForOp(op_name, status); EXPECT_EQ(TF_OK, TF_GetCode(status)); @@ -144,7 +152,7 @@ TEST(TestKernel, TestRegisterKernelBuilder) { { Status status; std::unique_ptr kernel = - GetFakeKernel(device_name, op_name, &status); + GetFakeKernel(device_name, op_name, node_name, &status); TF_EXPECT_OK(status); ASSERT_NE(nullptr, kernel.get()); kernel->Compute(nullptr); @@ -162,7 +170,7 @@ class DummyDevice : public DeviceBase { }; TEST(TestKernel, TestInputAndOutputCount) { - const char* kernel_name = "InputOutputCounterKernel"; + const char* node_name = "InputOutputCounterKernel"; const char* op_name = "BarOp"; const char* device_name = "FakeDeviceName2"; @@ -212,7 +220,7 @@ TEST(TestKernel, TestInputAndOutputCount) { { TF_Status* status = TF_NewStatus(); - TF_RegisterKernelBuilder(kernel_name, builder, status); + TF_RegisterKernelBuilder(node_name, builder, status); EXPECT_EQ(TF_OK, TF_GetCode(status)); TF_DeleteStatus(status); } @@ -233,7 +241,7 @@ TEST(TestKernel, TestInputAndOutputCount) { Status status; std::unique_ptr kernel = - GetFakeKernel(device_name, op_name, &status); + GetFakeKernel(device_name, op_name, node_name, &status); TF_EXPECT_OK(status); ASSERT_NE(nullptr, kernel.get()); @@ -252,7 +260,7 @@ TEST(TestKernel, DeleteKernelBuilderIsOkOnNull) { } TEST(TestKernel, TestTypeConstraint) { - const char* kernel_name = "SomeKernelName"; + const char* node_name = "SomeNodeName"; const char* op_name = "TypeOp"; const char* device_name = "FakeDeviceName1"; @@ -267,7 +275,7 @@ TEST(TestKernel, TestTypeConstraint) { TF_Status* status = TF_NewStatus(); TF_KernelBuilder_TypeConstraint(builder, "T", TF_DataType::TF_INT32, status); EXPECT_EQ(TF_OK, TF_GetCode(status)); - TF_RegisterKernelBuilder(kernel_name, builder, status); + TF_RegisterKernelBuilder(node_name, builder, status); EXPECT_EQ(TF_OK, TF_GetCode(status)); TF_Buffer* buf = TF_GetRegisteredKernelsForOp(op_name, status); @@ -296,7 +304,7 @@ TEST(TestKernel, TestTypeConstraint) { } TEST(TestKernel, TestHostMemory) { - const char* kernel_name = "SomeKernelName"; + const char* node_name = "SomeNodeName"; const char* op_name = "HostMemoryOp"; const char* device_name = "FakeDeviceName1"; @@ -311,7 +319,7 @@ TEST(TestKernel, TestHostMemory) { TF_KernelBuilder_HostMemory(builder, "input2"); TF_KernelBuilder_HostMemory(builder, "output1"); TF_Status* status = TF_NewStatus(); - TF_RegisterKernelBuilder(kernel_name, builder, status); + TF_RegisterKernelBuilder(node_name, builder, status); EXPECT_EQ(TF_OK, TF_GetCode(status)); TF_Buffer* buf = TF_GetRegisteredKernelsForOp(op_name, status); @@ -335,12 +343,12 @@ TEST(TestKernel, TestHostMemory) { class DeviceKernelOpTest : public OpsTestBase { protected: - void SetupOp(const char* op_name, const char* kernel_name, + void SetupOp(const char* op_name, const char* node_name, void (*compute_func)(void*, TF_OpKernelContext*)) { TF_KernelBuilder* builder = TF_NewKernelBuilder( op_name, device_name_, nullptr, compute_func, nullptr); TF_Status* status = TF_NewStatus(); - TF_RegisterKernelBuilder(kernel_name, 
builder, status); + TF_RegisterKernelBuilder(node_name, builder, status); EXPECT_EQ(TF_OK, TF_GetCode(status)); TF_DeleteStatus(status); diff --git a/tensorflow/c/logging.cc b/tensorflow/c/logging.cc new file mode 100644 index 00000000000..bf6bf069fff --- /dev/null +++ b/tensorflow/c/logging.cc @@ -0,0 +1,59 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/c/logging.h" + +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/stringprintf.h" + +static ::tensorflow::string BuildMessage(const char* fmt, va_list args) { + ::tensorflow::string message; + ::tensorflow::strings::Appendv(&message, fmt, args); + return message; +} + +void TF_Log(TF_LogLevel level, const char* fmt, ...) { + if (level < TF_INFO || level > TF_FATAL) return; + va_list args; + va_start(args, fmt); + auto message = BuildMessage(fmt, args); + switch (level) { + case TF_INFO: + LOG(INFO) << message; + break; + case TF_WARNING: + LOG(WARNING) << message; + break; + case TF_ERROR: + LOG(ERROR) << message; + break; + case TF_FATAL: + LOG(FATAL) << message; + break; + } +} + +void TF_VLog(int level, const char* fmt, ...) { + va_list args; + va_start(args, fmt); + auto message = BuildMessage(fmt, args); + VLOG(level) << message; +} + +void TF_DVLog(int level, const char* fmt, ...) { + va_list args; + va_start(args, fmt); + auto message = BuildMessage(fmt, args); + DVLOG(level) << message; +} diff --git a/tensorflow/c/experimental/saved_model/internal/tensorhandle_list.cc b/tensorflow/c/logging.h similarity index 52% rename from tensorflow/c/experimental/saved_model/internal/tensorhandle_list.cc rename to tensorflow/c/logging.h index c8f00c1f7c0..9583777b661 100644 --- a/tensorflow/c/experimental/saved_model/internal/tensorhandle_list.cc +++ b/tensorflow/c/logging.h @@ -12,25 +12,31 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#ifndef TENSORFLOW_C_LOGGING_H_ +#define TENSORFLOW_C_LOGGING_H_ -#include "tensorflow/c/experimental/saved_model/public/tensorhandle_list.h" +#include "tensorflow/c/c_api_macros.h" -#include - -#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" -#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" -#include "tensorflow/c/experimental/saved_model/internal/tensorhandle_list_type.h" +// -------------------------------------------------------------------------- +// C API for tensorflow::Logging. 
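The header that follows exposes printf-style logging to C clients; formatting happens in BuildMessage above and the result is forwarded to the usual LOG/VLOG macros. A minimal usage sketch, where the message text and the `kernel_name`/`elapsed_ms` variables are illustrative:

/* Sketch: logging through the C API declared below. */
TF_Log(TF_WARNING, "kernel %s took %d ms", kernel_name, elapsed_ms);
TF_VLog(2, "emitted only when the process's VLOG level is >= 2");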
+#ifdef __cplusplus extern "C" { +#endif -size_t TF_TensorHandleListSize(const TF_TensorHandleList* list) { - return tensorflow::unwrap(list)->size(); +typedef enum TF_LogLevel { + TF_INFO = 0, + TF_WARNING = 1, + TF_ERROR = 2, + TF_FATAL = 3, +} TF_LogLevel; + +TF_CAPI_EXPORT extern void TF_Log(TF_LogLevel level, const char* fmt, ...); +TF_CAPI_EXPORT extern void TF_VLog(int level, const char* fmt, ...); +TF_CAPI_EXPORT extern void TF_DVLog(int level, const char* fmt, ...); + +#ifdef __cplusplus } +#endif -TFE_TensorHandle* TF_TensorHandleListGet(const TF_TensorHandleList* list, - int i) { - return tensorflow::wrap((*tensorflow::unwrap(list))[i]); -} - - -} // end extern "C" +#endif // TENSORFLOW_C_LOGGING_H_ diff --git a/tensorflow/c/ops.cc b/tensorflow/c/ops.cc index 118385ed72c..cc0eddfcbf6 100644 --- a/tensorflow/c/ops.cc +++ b/tensorflow/c/ops.cc @@ -104,6 +104,12 @@ TF_ShapeHandle* TF_NewShapeHandle() { return reinterpret_cast(new ShapeHandle); } +TF_ShapeHandle* TF_ShapeInferenceContextScalar(TF_ShapeInferenceContext* ctx) { + auto* handle = new ShapeHandle; + *handle = reinterpret_cast(ctx)->Scalar(); + return reinterpret_cast(handle); +} + TF_ShapeHandle* TF_ShapeInferenceContextVectorFromSize( TF_ShapeInferenceContext* ctx, size_t size) { auto* handle = new ShapeHandle; diff --git a/tensorflow/c/ops.h b/tensorflow/c/ops.h index 14868e40260..7463809e35b 100644 --- a/tensorflow/c/ops.h +++ b/tensorflow/c/ops.h @@ -280,6 +280,11 @@ extern void TF_ShapeInferenceContextSetOutput(TF_ShapeInferenceContext* ctx, int i, TF_ShapeHandle* handle, TF_Status* status); +// Returns a newly-allocated scalar shape handle. The returned handle should +// be freed with TF_DeleteShapeHandle. +TF_CAPI_EXPORT extern TF_ShapeHandle* TF_ShapeInferenceContextScalar( + TF_ShapeInferenceContext* ctx); + // Returns a newly-allocate shape handle representing a vector of the given // size. The returned handle should be freed with TF_DeleteShapeHandle. 
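Stepping out of the header text for a moment: the TF_ShapeInferenceContextScalar helper added above pairs with the existing TF_ShapeInferenceContextSetOutput when writing a shape function for an op whose output is a scalar. An illustrative sketch (an editor's addition; MyScalarShapeFn is a placeholder name and status checking is elided):

    static void MyScalarShapeFn(TF_ShapeInferenceContext* ctx, TF_Status* status) {
      // Newly allocated scalar ("[]") shape handle, assigned to output 0.
      TF_ShapeHandle* scalar = TF_ShapeInferenceContextScalar(ctx);
      TF_ShapeInferenceContextSetOutput(ctx, 0, scalar, status);
      // The caller owns the handle and must free it.
      TF_DeleteShapeHandle(scalar);
    }

The vector-sized variant documented immediately above is declared next.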
TF_CAPI_EXPORT extern TF_ShapeHandle* TF_ShapeInferenceContextVectorFromSize( diff --git a/tensorflow/c/ops_test.cc b/tensorflow/c/ops_test.cc index 482413f966c..9fbf4dcbf8b 100644 --- a/tensorflow/c/ops_test.cc +++ b/tensorflow/c/ops_test.cc @@ -316,5 +316,16 @@ TEST(OpsTest, ShapeInferenceSubshape) { TF_DeleteShapeHandle(handle); } +TEST(OpsTest, ShapeInferenceScalarShape) { + NodeDef def; + shape_inference::InferenceContext c(0, def, MakeOpDef(0, 0), {S({})}, {}, {}, + {}); + TF_ShapeHandle* TF_scalar_shape = TF_ShapeInferenceContextScalar(C_CTX(&c)); + shape_inference::ShapeHandle* scalar_shape = + reinterpret_cast(TF_scalar_shape); + ASSERT_EQ("[]", c.DebugString(*scalar_shape)); + TF_DeleteShapeHandle(TF_scalar_shape); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/c/tf_tensor.cc b/tensorflow/c/tf_tensor.cc index 0feb986ce44..39d2683226f 100644 --- a/tensorflow/c/tf_tensor.cc +++ b/tensorflow/c/tf_tensor.cc @@ -288,7 +288,7 @@ TF_Tensor* TF_TensorFromTensor(const tensorflow::Tensor& src, Status* status) { if (!tensor.CopyFrom(src, src.shape())) { return nullptr; } - return new TF_Tensor{new tensorflow::TensorInterface(tensor)}; + return new TF_Tensor{new tensorflow::TensorInterface(std::move(tensor))}; } Status TF_TensorToTensor(const TF_Tensor* src, Tensor* dst) { diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD index e1fad8e697a..8602bfafff8 100644 --- a/tensorflow/cc/BUILD +++ b/tensorflow/cc/BUILD @@ -558,6 +558,7 @@ tf_gen_op_wrappers_cc( "io_ops", "linalg_ops", "list_ops", + "map_ops", "logging_ops", "lookup_ops", "manip_ops", diff --git a/tensorflow/cc/framework/gradients.cc b/tensorflow/cc/framework/gradients.cc index 88cd3fe79d6..3195a357186 100644 --- a/tensorflow/cc/framework/gradients.cc +++ b/tensorflow/cc/framework/gradients.cc @@ -425,7 +425,7 @@ Status SymbolicGradientBuilder::ProcessWhileLoop(Node* exit_node, // Backprop along the in edges to the while loop (i.e. the inputs to the enter // nodes) DCHECK_EQ(dx.size(), while_ctx->enter_nodes().size()); - for (int i = 0; i < dx.size(); ++i) { + for (int i = 0, end = dx.size(); i < end; ++i) { Node* enter_node = while_ctx->enter_nodes()[i]; for (const Edge* e : enter_node->in_edges()) { if (e->IsControlEdge()) continue; @@ -489,7 +489,7 @@ Status SymbolicGradientBuilder::AddGradients() { // All loop-specific control flow ops should have been handled above DCHECK(!n->IsEnter() && !n->IsNextIteration()) << n->DebugString(); - const size_t num_no_grad = no_grad_dy_indices.size(); + const int num_no_grad = no_grad_dy_indices.size(); if (IsPrimitiveOpWithNoGrad(n->type_string()) || num_no_grad == num_y) { // No grad defined for this op, or all outputs returned 'NoGradient': // Backprop 'NoGradient' along the in edges. @@ -524,7 +524,7 @@ Status SymbolicGradientBuilder::AddGradients() { // make this association explicit. 
for (const Edge* e : n->in_edges()) { if (e->IsControlEdge()) continue; - int dx_index = e->dst_input(); + size_t dx_index = e->dst_input(); if (dx_index >= dx.size()) { return errors::Internal( "Invalid gradient output index: ", dx_index, " size: ", dx.size()); diff --git a/tensorflow/cc/framework/while_gradients.cc b/tensorflow/cc/framework/while_gradients.cc index 81870a0efa3..e241cfaebe9 100644 --- a/tensorflow/cc/framework/while_gradients.cc +++ b/tensorflow/cc/framework/while_gradients.cc @@ -34,7 +34,7 @@ Output ToOutput(OutputTensor output_tensor) { std::vector ToOutputVector( const std::vector& output_tensors) { - size_t n = output_tensors.size(); + const int n = output_tensors.size(); std::vector result; result.reserve(n); for (int i = 0; i < n; ++i) result.push_back(ToOutput(output_tensors[i])); diff --git a/tensorflow/cc/saved_model/experimental/tests/saved_model_api_test.cc b/tensorflow/cc/saved_model/experimental/tests/saved_model_api_test.cc index ad80b74f1d5..cf5f742538e 100644 --- a/tensorflow/cc/saved_model/experimental/tests/saved_model_api_test.cc +++ b/tensorflow/cc/saved_model/experimental/tests/saved_model_api_test.cc @@ -86,11 +86,7 @@ TEST_P(CPPSavedModelAPITest, LoadsSavedModel) { std::unique_ptr model = SavedModelAPI::Load(model_dir, *runtime, &status); - // TODO(bmzhao): Change this to expect TF_OK when loading is implemented. - // That unblocks writing other tests that require a TF_SavedModel*, - // like loading a ConcreteFunction. This test at least checks that the - // C API builds and can be minimally run. - EXPECT_EQ(status.code(), TF_UNIMPLEMENTED) << status.message(); + EXPECT_EQ(status.code(), TF_OK) << status.message(); } INSTANTIATE_TEST_SUITE_P(RuntimeAgnosticCPPSavedModelTests, diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD index d091146c75a..ff255dd9cc1 100644 --- a/tensorflow/compiler/aot/BUILD +++ b/tensorflow/compiler/aot/BUILD @@ -308,6 +308,8 @@ cc_library( ], deps = [ "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/tf2xla:xla_context", + "//tensorflow/compiler/tf2xla:xla_op_registry", "//tensorflow/core:framework", ], alwayslink = 1, diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc index e4df3090046..ae50a447b19 100644 --- a/tensorflow/compiler/aot/codegen.cc +++ b/tensorflow/compiler/aot/codegen.cc @@ -172,7 +172,7 @@ string RewriteWithName(const string& name, string code, Status GenArgMethods(const tf2xla::Config& config, const xla::ProgramShapeProto& ps, const CompileResult& compile_result, string* methods) { - size_t num_args = ps.parameters_size(); + const int num_args = ps.parameters_size(); // feed_size() + variable_size() is the maximum number of args as an // implementation may not create an argument for an unused variable. 
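A note on the pattern recurring in this and many of the surrounding hunks (gradients.cc, while_gradients.cc, codegen.cc above, and the compiler/jit files below): container sizes are hoisted once into signed locals such as `const int num_args` or a loop-scoped `end`, so that comparisons against signed indices no longer mix signedness. A condensed before/after sketch (an editor's illustration):

    std::vector<std::string> parts = {"a", "b", "c"};

    // Before: `i` is signed while parts.size() is unsigned, so the comparison
    // triggers -Wsign-compare, and size() is evaluated on every iteration.
    for (int i = 0; i < parts.size(); ++i) { /* ... */ }

    // After: evaluate the size once into a signed bound, as done throughout
    // this change.
    for (int i = 0, end = parts.size(); i < end; ++i) { /* ... */ }

The bounds check that continues below compares against the signed num_args computed in the same way.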
if (config.feed_size() + config.variable_size() < num_args) { @@ -229,8 +229,9 @@ Status GenResultMethods(const tf2xla::Config& config, int readonly_variables = absl::c_count_if( config.variable(), [](const tf2xla::Variable& var) { return var.readonly(); }); - if (config.fetch_size() + config.variable_size() - readonly_variables != - num_results) { + const int actual_num_results = + config.fetch_size() + config.variable_size() - readonly_variables; + if (actual_num_results != num_results) { return errors::InvalidArgument("mismatch between fetch_size(", config.fetch_size(), ")+variable_size(", config.variable_size(), ") and tuple_size(", @@ -273,7 +274,7 @@ Status GenResultMethods(const tf2xla::Config& config, // Generate methods for variables. Status GenVariableMethods(const tf2xla::Config& config, const xla::ProgramShapeProto& ps, string* methods) { - size_t num_args = ps.parameters_size(); + const int num_args = ps.parameters_size(); for (int i = config.feed_size(); i < num_args; ++i) { std::vector> rewrites; TF_RETURN_IF_ERROR( @@ -401,7 +402,8 @@ Status GenerateHeader(const CodegenOpts& opts, const tf2xla::Config& config, ::xla::cpu::CreateArgIndexTableFromBufferInfos(buffer_infos); std::vector buffer_infos_as_strings = BufferInfosToCppExpression(buffer_infos); - if (result_index < 0 || result_index >= buffer_infos.size()) { + const int64 buffer_infos_size = buffer_infos.size(); + if (result_index < 0 || result_index >= buffer_infos_size) { return errors::InvalidArgument("result index: ", result_index, " is outside the range of temp sizes: [0,", buffer_infos.size(), ")"); @@ -797,8 +799,8 @@ Status ParseCppClass(const string& cpp_class, string* class_name, // Allow a fully qualified name that starts with "::". parts.erase(parts.begin()); } - for (int i = 0; i < parts.size(); ++i) { - if (i < parts.size() - 1) { + for (int i = 0, end = parts.size(); i < end; ++i) { + if (i < end - 1) { TF_RETURN_IF_ERROR(ValidateCppIdent( parts[i], "in namespace component of cpp_class: " + cpp_class)); namespaces->push_back(parts[i]); diff --git a/tensorflow/compiler/aot/tests/BUILD b/tensorflow/compiler/aot/tests/BUILD index 0c44ed8bf37..5f6b3dc7101 100644 --- a/tensorflow/compiler/aot/tests/BUILD +++ b/tensorflow/compiler/aot/tests/BUILD @@ -63,7 +63,7 @@ py_binary( testonly = 1, srcs = ["make_test_graphs.py"], python_version = "PY3", - srcs_version = "PY2AND3", + srcs_version = "PY3", deps = [ "//tensorflow/core:protos_all_py", "//tensorflow/python", # TODO(b/34059704): remove when fixed @@ -110,8 +110,8 @@ genrule( # have control of the full GPU. 
cmd = "CUDA_VISIBLE_DEVICES='' " + "$(location :make_test_graphs) --out_dir $(@D)", + exec_tools = [":make_test_graphs"], tags = ["manual"], - tools = [":make_test_graphs"], ) tf_library( diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index 5ec0575ed77..d05bb8264c3 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -95,6 +95,7 @@ cc_library( ":xla_kernel_creator", # buildcleaner: keep "//tensorflow/compiler/jit/kernels:xla_ops", "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/tf2xla:xla_op_registry", "//tensorflow/compiler/tf2xla/kernels:xla_ops", "//tensorflow/compiler/xla/service:cpu_plugin", # buildcleaner: keep "//tensorflow/core:core_cpu_internal", @@ -115,6 +116,7 @@ cc_library( ":xla_kernel_creator", # buildcleaner: keep "//tensorflow/compiler/jit/kernels:xla_ops", "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/tf2xla:xla_op_registry", "//tensorflow/compiler/tf2xla/kernels:xla_ops", "//tensorflow/compiler/xla/service:gpu_plugin", # buildcleaner: keep "//tensorflow/core:core_cpu_internal", @@ -126,22 +128,6 @@ cc_library( alwayslink = 1, ) -cc_library( - name = "xla_interpreter_device", - srcs = ["xla_interpreter_device.cc"], - visibility = [":friends"], - deps = [ - ":jit_compilation_passes", - ":xla_device", - "//tensorflow/compiler/jit/kernels:xla_ops", - "//tensorflow/compiler/tf2xla:xla_compiler", - "//tensorflow/compiler/tf2xla/kernels:xla_ops", - "//tensorflow/compiler/xla/service:interpreter_plugin", # buildcleaner: keep - "@com_google_absl//absl/memory", - ], - alwayslink = 1, -) - cc_library( name = "xla_tensor", srcs = ["xla_tensor.cc"], @@ -172,6 +158,7 @@ XLA_DEVICE_DEPS = [ "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/tf2xla:tf2xla_util", "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/tf2xla:xla_op_registry", "//tensorflow/compiler/tf2xla/kernels:xla_ops", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/client:client_library", @@ -208,6 +195,7 @@ XLA_DEVICE_DEPS = [ "//tensorflow/core/kernels/data:optional_ops", "//tensorflow/core/kernels/data:prefetch_dataset_op", "//tensorflow/core/profiler/lib:traceme", + "//tensorflow/stream_executor:tf_allocator_adapter", "//tensorflow/stream_executor/platform", ] @@ -218,14 +206,18 @@ cc_library( "xla_device.cc", "xla_device_context.cc", "xla_device_ops.cc", + "xla_platform_info.cc", ], hdrs = [ "xla_compile_on_demand_op.h", "xla_device.h", "xla_device_context.h", "xla_device_ops.h", + "xla_platform_info.h", ], - deps = XLA_DEVICE_DEPS, + # Public visibility is needed for external TF/XLA backends. + visibility = ["//visibility:public"], + deps = XLA_DEVICE_DEPS + [":xla_compilation_cache"], ) cc_library( @@ -341,6 +333,7 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:compile_mlir_util_no_tf_dialect_passes", "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/tf2xla:xla_context", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/client:client_library", @@ -388,22 +381,26 @@ cc_library( alwayslink = 1, ) -# Linked by tensorflow core, without registration of jit compilation passes -# which is not necessary to create and run a XlaLocalLaunchBase kernel. -# Linking jit compilation passes could cause programs stuck right now (b/140069592). 
cc_library( - name = "xla_kernel_creator_util", + name = "xla_kernel_creator", srcs = [ - "xla_kernel_creator_util.cc", + "xla_kernel_creator.cc", + "xla_kernel_creator.h", + ], + visibility = [ + ":internal", + "//learning/brain/contrib/tpu_modeling/exp/tpu_inference_converter:__pkg__", + "//tensorflow/core/common_runtime/eager:__pkg__", ], - hdrs = ["xla_kernel_creator_util.h"], - visibility = ["//tensorflow/core/common_runtime/eager:__pkg__"], deps = [ ":common", ":compilability_check_util", ":compilation_passes", + ":flags", + ":jit_compilation_passes", "//tensorflow/compiler/jit/kernels:xla_ops_no_jit_rewrite_registration", "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/tf2xla:xla_op_registry", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", "//tensorflow/core:lib", @@ -415,25 +412,6 @@ cc_library( alwayslink = 1, ) -cc_library( - name = "xla_kernel_creator", - srcs = [ - "xla_kernel_creator.cc", - "xla_kernel_creator.h", - ], - deps = [ - ":compilability_check_util", - ":flags", - ":jit_compilation_passes", - ":xla_kernel_creator_util", - "//tensorflow/core:core_cpu_internal", - "//tensorflow/core:framework", - "//tensorflow/core:lib", - "//tensorflow/core:protos_all_cc", - ], - alwayslink = 1, -) - tf_cc_test( name = "xla_kernel_creator_test", srcs = [ @@ -639,6 +617,7 @@ cc_library( "//tensorflow/compiler/tf2xla:side_effect_util", "//tensorflow/compiler/tf2xla:tf2xla_util", "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/tf2xla:xla_op_registry", "//tensorflow/compiler/tf2xla/cc:xla_jit_ops", "//tensorflow/compiler/tf2xla/cc:xla_ops", "//tensorflow/compiler/xla:status_macros", @@ -648,11 +627,11 @@ cc_library( "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", - "//tensorflow/core:framework_bounds_check", "//tensorflow/core:graph", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", + "//tensorflow/core/framework:bounds_check", "//tensorflow/stream_executor/lib", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/base", @@ -677,11 +656,11 @@ cc_library( "//tensorflow/compiler/xla:statusor", "//tensorflow/core:core_cpu", "//tensorflow/core:framework", - "//tensorflow/core:framework_bounds_check", "//tensorflow/core:framework_internal", "//tensorflow/core:graph", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", + "//tensorflow/core/framework:bounds_check", "//tensorflow/stream_executor/lib", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:flat_hash_map", @@ -698,6 +677,7 @@ cc_library( hdrs = ["device_util.h"], deps = [ "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/tf2xla:xla_op_registry", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", "//tensorflow/core:framework", @@ -912,6 +892,7 @@ cc_library( "//tensorflow/compiler/jit/graphcycles", "//tensorflow/compiler/tf2xla:resource_operation_table", "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/tf2xla:xla_op_registry", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:util", "//tensorflow/core:core_cpu", diff --git a/tensorflow/compiler/jit/build_xla_ops_pass.cc b/tensorflow/compiler/jit/build_xla_ops_pass.cc index 5a57008cf61..a340b9d3f45 100644 --- a/tensorflow/compiler/jit/build_xla_ops_pass.cc +++ b/tensorflow/compiler/jit/build_xla_ops_pass.cc @@ -452,7 +452,7 @@ Status PredicateInt32Inputs(const Scope& root, Node* n, 
root.graph()->AddControlEdge(predicate_as_control.node(), identity_n.operation.node()); - for (int i = 0; i < int32_inputs.size(); i++) { + for (int i = 0, end = int32_inputs.size(); i < end; i++) { TF_RETURN_IF_ERROR(root.graph()->UpdateEdge(identity_n[i].node(), i, n, int32_inputs_input_idxs[i])); } diff --git a/tensorflow/compiler/jit/compilability_check_util.h b/tensorflow/compiler/jit/compilability_check_util.h index a21cb6b98dd..3b20784cc29 100644 --- a/tensorflow/compiler/jit/compilability_check_util.h +++ b/tensorflow/compiler/jit/compilability_check_util.h @@ -257,7 +257,7 @@ class RecursiveCompilabilityChecker { UncompilableNodesMap* uncompilable_nodes_map); // Make sure we don't recurse infinitely on recursive functions. - const int kMaxRecursionDepth = 10; + const size_t kMaxRecursionDepth = 10; const OperationFilter& op_filter_; const DeviceType& jit_device_type_; diff --git a/tensorflow/compiler/jit/device_util.cc b/tensorflow/compiler/jit/device_util.cc index 375d30c4cf3..d8749baf872 100644 --- a/tensorflow/compiler/jit/device_util.cc +++ b/tensorflow/compiler/jit/device_util.cc @@ -26,8 +26,8 @@ using xla::StatusOr; void DeviceSet::Insert(DeviceId device_id) { int word_index = device_id.id() / kWordSize; int bit_index = device_id.id() % kWordSize; - - if (word_index >= storage_.size()) { + const int storage_size = storage_.size(); + if (word_index >= storage_size) { storage_.resize(word_index + 1, 0); } @@ -39,7 +39,7 @@ void DeviceSet::UnionWith(const DeviceSet& other) { storage_.resize(other.storage_.size(), 0); } - for (int i = 0; i < other.storage_.size(); i++) { + for (int i = 0, end = other.storage_.size(); i < end; i++) { storage_[i] |= other.storage_[i]; } } diff --git a/tensorflow/compiler/jit/device_util.h b/tensorflow/compiler/jit/device_util.h index 35f3321b47b..6304cc813ca 100644 --- a/tensorflow/compiler/jit/device_util.h +++ b/tensorflow/compiler/jit/device_util.h @@ -72,7 +72,8 @@ class DeviceSet { void ForEach(FnTy func) const { // This is really a poor man's iterator, we should consider writing a proper // iterator if this ends up being used widely. 
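For readers unfamiliar with the idiom in the loop that follows: for an unsigned word, `word & -word` isolates its lowest set bit, so the loop can visit only the bits that are actually set (each corresponding to a device id) instead of scanning all 64 positions. A standalone sketch of the same technique (an editor's illustration; ForEachSetBit is a made-up helper and __builtin_ctzll is a GCC/Clang builtin):

    #include <cstdint>
    #include <functional>

    void ForEachSetBit(uint64_t word, const std::function<void(int)>& fn) {
      while (word != 0) {
        uint64_t only_lowest_bit_set = word & -word;  // isolate the lowest 1 bit
        fn(__builtin_ctzll(word));                    // index of that bit
        word ^= only_lowest_bit_set;                  // clear it and continue
      }
    }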
- for (int word_index = 0; word_index < storage_.size(); word_index++) { + for (int word_index = 0, end = storage_.size(); word_index < end; + word_index++) { uint64 word = storage_[word_index]; while (word != 0) { uint64 only_lowest_bit_set = word & -word; diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc index 435c2ec5f7f..d482642b44c 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc @@ -1132,7 +1132,8 @@ static Status GetArgTypes(const Graph& graph, DataTypeVector* types) { if (n->type_string() == kArgOp) { int index; TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index)); - if (index < 0 || index >= types->size()) { + const int num_types = types->size(); + if (index < 0 || index >= num_types) { return errors::InvalidArgument("Invalid argument number"); } (*types)[index] = n->output_type(0); @@ -1149,7 +1150,8 @@ static Status RenumberArguments(Graph* graph, if (n->type_string() == kArgOp) { int index; TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index)); - if (index < 0 || index >= permutation.size()) { + const int permutation_size = permutation.size(); + if (index < 0 || index >= permutation_size) { return errors::InvalidArgument("Invalid argument number"); } n->AddAttr("index", permutation[index]); diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc index 6640a5d5dba..efd2ef24c3b 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc @@ -295,19 +295,6 @@ bool EqualFunctionDefLibrary(const FunctionDefLibrary& expected, << diff << "\nActual: " << actual.DebugString(); \ } while (false) -// These dummy Op registrations are here because the real Op registrations live -// in contrib and there can't be a dependence from this test to contrib. 
-REGISTER_OP("XlaHostCompute") - .Input("inputs: Tinputs") - .Output("outputs: Toutputs") - .Attr("Tinputs: list(type) >= 0") - .Attr("Toutputs: list(type) >= 0") - .Attr("ancestors: list(string) >= 0") - .Attr("key: string") - .Attr("shape_inference_graph: func") - .Attr("shapes: list(shape) >= 0") - .SetShapeFn(::tensorflow::shape_inference::UnknownShape); - REGISTER_OP("InputTest") .Output("o: float") .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { @@ -947,6 +934,8 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) { {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"shape_inference_graph", shape_inference_graph}, + {"tpu_core", 0}, + {"cost_estimate_ns", 1000000}, {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", @@ -1114,6 +1103,8 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) { {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O2"}, {"shape_inference_graph", shape_inference_graph2}, + {"tpu_core", 0}, + {"cost_estimate_ns", 1000000}, {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O2"}, {"_xla_token_input_nodes", @@ -1130,6 +1121,8 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) { {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"shape_inference_graph", shape_inference_graph1}, + {"tpu_core", 0}, + {"cost_estimate_ns", 1000000}, {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", @@ -1266,6 +1259,8 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) { {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"shape_inference_graph", NameAttrList()}, + {"tpu_core", 0}, + {"cost_estimate_ns", 1000000}, {"shapes", absl::Span({shape_proto_expected})}, {"_outside_compilation_subgraph", "O1"}, @@ -1295,6 +1290,8 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) { {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F2_F2_O1"}, {"shape_inference_graph", NameAttrList()}, + {"tpu_core", 0}, + {"cost_estimate_ns", 1000000}, {"shapes", absl::Span({shape_proto_expected})}, {"_outside_compilation_subgraph", "O1"}, @@ -1428,6 +1425,8 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) { {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"shape_inference_graph", NameAttrList()}, + {"tpu_core", 0}, + {"cost_estimate_ns", 1000000}, {"shapes", absl::Span({shape_proto_expected})}, {"_outside_compilation_subgraph", "O1"}, @@ -1454,6 +1453,8 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) { {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F2_F2_O1"}, {"shape_inference_graph", NameAttrList()}, + {"tpu_core", 0}, + {"cost_estimate_ns", 1000000}, {"shapes", absl::Span({shape_proto_expected})}, {"_outside_compilation_subgraph", "O1"}, @@ -1566,6 +1567,8 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) { {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"shape_inference_graph", NameAttrList()}, + {"tpu_core", 0}, + {"cost_estimate_ns", 1000000}, {"shapes", absl::Span({shape_proto_expected})}, {"_outside_compilation_subgraph", "O1"}, @@ -1658,6 +1661,8 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) { {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"shape_inference_graph", NameAttrList()}, + {"tpu_core", 0}, + {"cost_estimate_ns", 1000000}, {"shapes", absl::Span({shape_proto_expected})}, 
{"_outside_compilation_subgraph", "O1"}, @@ -1765,6 +1770,8 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) { {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"shape_inference_graph", shape_inference_graph}, + {"tpu_core", 0}, + {"cost_estimate_ns", 1000000}, {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", @@ -1875,6 +1882,8 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) { {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"shape_inference_graph", shape_inference_graph}, + {"tpu_core", 0}, + {"cost_estimate_ns", 1000000}, {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", @@ -2009,6 +2018,8 @@ TEST(EncapsulateSubgraphsTest, {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"shape_inference_graph", shape_inference_graph1}, + {"tpu_core", 0}, + {"cost_estimate_ns", 1000000}, {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", @@ -2023,6 +2034,8 @@ TEST(EncapsulateSubgraphsTest, {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O2"}, {"shape_inference_graph", shape_inference_graph2}, + {"tpu_core", 0}, + {"cost_estimate_ns", 1000000}, {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O2"}, {"_xla_token_input_nodes", @@ -2153,6 +2166,8 @@ TEST(EncapsulateSubgraphsTest, {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O2"}, {"shape_inference_graph", NameAttrList()}, + {"tpu_core", 0}, + {"cost_estimate_ns", 1000000}, {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O2"}, {"_xla_token_input_nodes", @@ -2169,6 +2184,8 @@ TEST(EncapsulateSubgraphsTest, {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"shape_inference_graph", shape_inference_graph}, + {"tpu_core", 0}, + {"cost_estimate_ns", 1000000}, {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", @@ -2296,6 +2313,8 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) { {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"shape_inference_graph", shape_inference_graph}, + {"tpu_core", 0}, + {"cost_estimate_ns", 1000000}, {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", @@ -2310,6 +2329,8 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) { {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O2"}, {"shape_inference_graph", NameAttrList()}, + {"tpu_core", 0}, + {"cost_estimate_ns", 1000000}, {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O2"}, {"_xla_token_input_nodes", @@ -2325,6 +2346,8 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) { {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O3"}, {"shape_inference_graph", NameAttrList()}, + {"tpu_core", 0}, + {"cost_estimate_ns", 1000000}, {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O3"}, {"_xla_token_input_nodes", @@ -2451,6 +2474,8 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputsOrOutputs) { {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"shape_inference_graph", shape_inference_graph}, + {"tpu_core", 0}, + {"cost_estimate_ns", 1000000}, {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", @@ -2567,6 +2592,8 @@ TEST(EncapsulateSubgraphsTest, 
OutsideCompilationShapeInference) { {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"shape_inference_graph", shape_inference_graph}, + {"tpu_core", 0}, + {"cost_estimate_ns", 1000000}, {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", diff --git a/tensorflow/compiler/jit/encapsulate_util.cc b/tensorflow/compiler/jit/encapsulate_util.cc index 5325f6faa31..12afee70716 100644 --- a/tensorflow/compiler/jit/encapsulate_util.cc +++ b/tensorflow/compiler/jit/encapsulate_util.cc @@ -139,7 +139,7 @@ Status PreprocessDataEdgesBetweenOutsideCompilations( // Remove the edge from host to outside compilation. Add a placeholder as // outside compilation node input. std::map, Node*> placeholders; - for (int i = 0; i < edges.size(); i++) { + for (int i = 0, end = edges.size(); i < end; i++) { Node* dst = g->FindNodeId(edges[i].dst_node_id); const Edge* e; TF_RETURN_IF_ERROR(dst->input_edge(edges[i].dst_input, &e)); @@ -185,7 +185,7 @@ Status PreprocessDataEdgesBetweenOutsideCompilations( // Other edge in `edges` might have `e->dst()` as src or dst // node. Before removing `e->dst()`, replace those edges with // corresponding edges for `dst_replace_node`. - for (int j = i + 1; j < edges.size(); j++) { + for (int j = i + 1, end = edges.size(); j < end; j++) { if (edges[j].dst_node_id == edges[i].dst_node_id) { edges[j].dst_node_id = dst_replace_node->id(); } @@ -238,7 +238,7 @@ Status PostprocessDataEdgesBetweenOutsideCompilations( g->AddControlEdge(original_node, e->dst()); g->RemoveEdge(e); } - for (int i = 0; i < data_edges.size(); i++) { + for (int i = 0, end = data_edges.size(); i < end; i++) { Node* dst = data_edges[i].dst; NodeDef new_def = dst->def(); int dst_input = data_edges[i].dst_input; @@ -253,7 +253,7 @@ Status PostprocessDataEdgesBetweenOutsideCompilations( // Other edges might have `dst` as dst node. Update those edges with // `replace_node`. 
- for (int j = i + 1; j < data_edges.size(); j++) { + for (int j = i + 1, end = data_edges.size(); j < end; j++) { if (data_edges[j].dst == dst) { data_edges[j].dst = replace_node; } diff --git a/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc b/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc index 2b7a6c83b8b..ed25baa62ff 100644 --- a/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc +++ b/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc @@ -351,14 +351,14 @@ Status RewriteSubgraph(const std::vector& arg_source_tensors, if (!status.ok()) { return status; } - for (int i = 0; i < data_inputs.size(); ++i) { + for (int i = 0, end = data_inputs.size(); i < end; ++i) { graph->AddEdge(data_inputs[i].first, data_inputs[i].second, xla_launch, i); } for (Node* n : control_inputs) { graph->AddControlEdge(n, xla_launch); } - for (int i = 0; i < data_outputs.size(); ++i) { + for (int i = 0, end = data_outputs.size(); i < end; ++i) { for (const auto& successor : data_outputs[i]) { graph->AddEdge(xla_launch, i, successor.first, successor.second); } diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc index 5f1c3d536a8..fef43eb8730 100644 --- a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc +++ b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc @@ -95,7 +95,7 @@ Status GetArgDataTypes(const std::vector& arg_nodes, TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "T", &dtype)); (*recv_at_host_dtypes)[index] = dtype; } - for (int i = 0; i < recv_at_host_dtypes->size(); i++) { + for (int i = 0, end = recv_at_host_dtypes->size(); i < end; i++) { if ((*recv_at_host_dtypes)[i] == DT_INVALID) { return errors::Internal("Cannot get datatype for input ", i); } @@ -160,7 +160,7 @@ xla::StatusOr ReplaceArgNodesWithRecvAtHostNode( } // Rewrite dst nodes because their input changed. - for (int i = 0; i < out_edge_info.size(); i++) { + for (int i = 0, end = out_edge_info.size(); i < end; i++) { const OutEdgeInfo edge = out_edge_info[i]; if (edge.dst_input == Graph::kControlSlot) { continue; @@ -174,7 +174,7 @@ xla::StatusOr ReplaceArgNodesWithRecvAtHostNode( // Other edges might have `dst` as dst node as well. Update those edges // with `dst_replace`. 
- for (int j = i + 1; j < out_edge_info.size(); j++) { + for (int j = i + 1, end = out_edge_info.size(); j < end; j++) { if (out_edge_info[j].dst == dst) { out_edge_info[j].dst = dst_replace; } @@ -196,7 +196,7 @@ Status GetRetDataTypes(const std::vector& ret_nodes, TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "T", &dtype)); (*send_from_host_dtypes)[index] = dtype; } - for (int i = 0; i < send_from_host_dtypes->size(); i++) { + for (int i = 0, end = send_from_host_dtypes->size(); i < end; i++) { if ((*send_from_host_dtypes)[i] == DT_INVALID) { return errors::Internal("Cannot get datatype for output ", i); } @@ -226,7 +226,8 @@ xla::StatusOr BuildSendFromHostNode( for (auto* n : ret_nodes) { int index; TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index)); - if (index < 0 || index >= send_from_host_dtypes.size()) { + const int num_dtypes = send_from_host_dtypes.size(); + if (index < 0 || index >= num_dtypes) { return errors::Internal("Invalid _Retval index: ", index); } for (auto edge : n->in_edges()) { @@ -361,7 +362,8 @@ xla::StatusOr BuildXlaHostComputeNodeDef( continue; } - if (e->dst_input() < 0 || e->dst_input() >= input_dtypes.size()) { + const int input_dtypes_size = input_dtypes.size(); + if (e->dst_input() < 0 || e->dst_input() >= input_dtypes_size) { return errors::Internal("Invalid dst_input: ", e->dst_input()); } inputs[e->dst_input()] = NodeDefBuilder::NodeOut{ @@ -500,7 +502,7 @@ void AddEdgesFromOutsideCompilationNodes( const std::vector& data_types, const std::vector& outside_compilation_nodes, Graph* g, Node* n) { // Add edges from outside compilation nodes to While node. - for (int i = original_arg_count; i < data_types.size(); i++) { + for (int i = original_arg_count, end = data_types.size(); i < end; i++) { Node* outside_compilation_node = outside_compilation_nodes[i - original_arg_count]; g->AddEdge(outside_compilation_node, 0, n, i + arg_to_input_edge_offset); @@ -619,7 +621,7 @@ Status PostprocessLiftedArgsForWhile( lifted_arg_nodes_and_outside_compilation_nodes.end(), std::back_inserter(lifted_arg_nodes), [](const std::pair& pair) { return pair.first; }); - for (int i = original_arg_count; i < data_types.size(); i++) { + for (int i = original_arg_count, end = data_types.size(); i < end; i++) { TF_ASSIGN_OR_RETURN(Node * arg_node, AddOutsideCompilationInputArgToFunctionBody( *body_function_body, i, data_types[i])); @@ -648,7 +650,7 @@ Status PostprocessLiftedArgsForWhile( AttrSlice(&cond_func.attr()), fld, &cond_function_body)); - for (int i = original_arg_count; i < data_types.size(); i++) { + for (int i = original_arg_count, end = data_types.size(); i < end; i++) { xla::StatusOr arg_node_or = AddOutsideCompilationInputArgToFunctionBody(*cond_function_body, i, data_types[i]); @@ -759,7 +761,7 @@ Status PostprocessLiftedArgsForIf( data_types, outside_compilation_nodes, g, n); - for (int i = original_arg_count; i < data_types.size(); ++i) { + for (int i = original_arg_count, end = data_types.size(); i < end; ++i) { TF_ASSIGN_OR_RETURN(Node * then_branch_arg_node, AddOutsideCompilationInputArgToFunctionBody( *then_branch_function_body, i, data_types[i])); @@ -837,7 +839,7 @@ Status PostprocessLiftedArgsForCall( lifted_arg_nodes_and_outside_compilation_nodes.end(), std::back_inserter(lifted_arg_nodes), [](const std::pair& pair) { return pair.first; }); - for (int i = original_arg_count; i < data_types.size(); ++i) { + for (int i = original_arg_count, end = data_types.size(); i < end; ++i) { TF_ASSIGN_OR_RETURN( Node * arg_node, 
AddOutsideCompilationInputArgToFunctionBody(*fbody, i, data_types[i])); @@ -855,7 +857,7 @@ Status PostprocessLiftedArgsForCall( // We need to recreate the node. Otherwise TF will not know n->num_inputs() // has increased. NodeDef node_def = n->def(); - for (int i = original_arg_count; i < data_types.size(); i++) { + for (int i = original_arg_count, end = data_types.size(); i < end; i++) { Node* outside_compilation_node = lifted_arg_nodes_and_outside_compilation_nodes[i - original_arg_count] .second; @@ -1804,7 +1806,9 @@ TF_ATTRIBUTE_NOINLINE Status ExtractOutsideCompilationForFuncCallNode( continue; } - TF_RET_CHECK(e->dst_input() >= 0 && e->dst_input() < inputs.size()); + const bool input_size_check = + e->dst_input() < static_cast(inputs.size()); + TF_RET_CHECK(e->dst_input() >= 0 && input_size_check); inputs[e->dst_input()] = NodeDefBuilder::NodeOut{e->src()->name(), e->src_output(), e->src()->output_type(e->src_output())}; @@ -2420,6 +2424,7 @@ Status ExtractOutsideCompilationForFunction( auto updated_fdef = absl::make_unique(); TF_RETURN_IF_ERROR( GraphToFunctionDef(*g, new_func_name, updated_fdef.get())); + updated_fdef->mutable_signature()->set_is_stateful(true); const FunctionDef* original_fdef = fld->Find(func_name); if (original_fdef) { for (const auto& attr : original_fdef->attr()) { diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc b/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc index a6f2bd41275..b727dfc72fc 100644 --- a/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc +++ b/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc @@ -422,19 +422,6 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, NoHostGraph) { EXPECT_EQ(fld.Find("host_graph"), nullptr); } -REGISTER_OP("XlaSendToHost") - .Input("input: Tinput") - .Attr("Tinput: type") - .Attr("key: string") - .SetIsStateful(); - -REGISTER_OP("XlaRecvFromHost") - .Output("output: Toutput") - .Attr("Toutput: type") - .Attr("shape: shape") - .Attr("key: string") - .SetIsStateful(); - TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInIf) { // Build the XLA computation func. // "const0" (bool) diff --git a/tensorflow/compiler/jit/flags.cc b/tensorflow/compiler/jit/flags.cc index d1301a8c40f..ff085c854c6 100644 --- a/tensorflow/compiler/jit/flags.cc +++ b/tensorflow/compiler/jit/flags.cc @@ -268,10 +268,4 @@ void AppendMarkForCompilationPassFlags(std::vector* flag_list) { AppendMarkForCompilationPassFlagsInternal(flag_list); } -static bool xla_is_enabled = false; - -void SetXlaIsEnabled() { xla_is_enabled = true; } - -bool IsXlaEnabled() { return xla_is_enabled; } - } // namespace tensorflow diff --git a/tensorflow/compiler/jit/flags.h b/tensorflow/compiler/jit/flags.h index 89e20d9f8ea..6c54fc8825e 100644 --- a/tensorflow/compiler/jit/flags.h +++ b/tensorflow/compiler/jit/flags.h @@ -162,14 +162,6 @@ MlirCommonFlags* GetMlirCommonFlags(); void AppendMarkForCompilationPassFlags( std::vector* flag_list); -// Makes all future calls to `IsXlaEnabled()` return `true`. -// -// Should only be called when XLA is linked in. -void SetXlaIsEnabled(); - -// Returns whether XLA is enabled. 
-bool IsXlaEnabled(); - } // namespace tensorflow #endif // TENSORFLOW_COMPILER_JIT_FLAGS_H_ diff --git a/tensorflow/compiler/jit/graphcycles/graphcycles.cc b/tensorflow/compiler/jit/graphcycles/graphcycles.cc index 6c5e3a745e2..416e101a025 100644 --- a/tensorflow/compiler/jit/graphcycles/graphcycles.cc +++ b/tensorflow/compiler/jit/graphcycles/graphcycles.cc @@ -461,7 +461,7 @@ string GraphCycles::DebugString() const { } string result = "digraph {\n"; - for (int i = 0; i < rep_->nodes_.size(); i++) { + for (int i = 0, end = rep_->nodes_.size(); i < end; i++) { if (free_nodes_set.contains(i)) { continue; } diff --git a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc index 23931a0d7cd..bf9d88b73fa 100644 --- a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc +++ b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc @@ -194,7 +194,7 @@ Status ComputeSliceSize(const Scope& host_scope, ConstantCache constant_pool(host_scope, control_deps); std::vector slice_size; - for (int i = 0; i < slice_inputs.size_as_vector.size(); i++) { + for (int i = 0, end = slice_inputs.size_as_vector.size(); i < end; i++) { if (slice_inputs.size_as_vector[i] >= 0) { slice_size.push_back( constant_pool.Get1DHostConstant(slice_inputs.size_as_vector[i])); diff --git a/tensorflow/compiler/jit/kernels/BUILD b/tensorflow/compiler/jit/kernels/BUILD index 347bae087df..eb9ad8a2e85 100644 --- a/tensorflow/compiler/jit/kernels/BUILD +++ b/tensorflow/compiler/jit/kernels/BUILD @@ -21,6 +21,7 @@ XLA_OPS_DEPS = [ "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/tf2xla:tf2xla_util", "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/tf2xla:xla_op_registry", "//tensorflow/compiler/xla:executable_run_options", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", diff --git a/tensorflow/compiler/jit/kernels/xla_ops.cc b/tensorflow/compiler/jit/kernels/xla_ops.cc index 48347a2915f..9cee4b9af28 100644 --- a/tensorflow/compiler/jit/kernels/xla_ops.cc +++ b/tensorflow/compiler/jit/kernels/xla_ops.cc @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/jit/xla_activity_listener.h" #include "tensorflow/compiler/jit/xla_cluster_util.h" +#include "tensorflow/compiler/jit/xla_platform_info.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/tf2xla_util.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" @@ -63,38 +64,6 @@ namespace tensorflow { namespace { -XlaPlatformInfo PlatformInfoFromContext(OpKernelConstruction* ctx) { - DeviceType device_type = ctx->device_type(); - se::Platform::Id platform_id = nullptr; - const XlaDevice::Metadata* xla_device_metadata = nullptr; - se::DeviceMemoryAllocator* custom_allocator = nullptr; - - if (ctx->device_type() == DeviceType(DEVICE_CPU)) { - platform_id = se::host::kHostPlatformId; - } else if (ctx->device_type() == DeviceType(DEVICE_GPU)) { - platform_id = ctx->device() - ->tensorflow_gpu_device_info() - ->stream->parent() - ->platform() - ->id(); - } else if (XlaDevice::GetMetadata(ctx, &xla_device_metadata).ok()) { - // If we are on an XlaDevice, use the underlying XLA platform's allocator - // directly. We could use the StreamExecutor's allocator which may - // theoretically be more correct, but XLA returns a nice OOM message in a - // Status and StreamExecutor does not. 
- // - // Importantly we can't use ctx->device()->GetAllocator() as the allocator - // (which xla_allocator above uses) as on an XlaDevice, this is a dummy - // allocator that returns XlaTensor objects. The XlaCompiler needs a real - // allocator to allocate real buffers. - platform_id = xla_device_metadata->platform()->id(); - custom_allocator = - xla_device_metadata->client()->backend().memory_allocator(); - } - - return XlaPlatformInfo(device_type, platform_id, xla_device_metadata, - custom_allocator); -} // A closure describing how to run a compiled version of a TensorFlow function. // @@ -178,31 +147,6 @@ class XlaExecutableClosureStore { TF_DISALLOW_COPY_AND_ASSIGN(XlaExecutableClosureStore); }; -// Return allocator from platform info if non-null, or populate and return a -// pointer to the allocator adapter with allocator from context. -// -// This is necessary because for XLA devices the underlying TF allocator returns -// dummy tensors. -se::DeviceMemoryAllocator* GetAllocator( - absl::optional* tf_allocator_adapter, - OpKernelContext* ctx, const XlaPlatformInfo& platform_info) { - if (platform_info.custom_allocator()) { - return platform_info.custom_allocator(); - } - if (!ctx->op_device_context()) { - // Stream is not set for the host platform. - se::Platform* platform = - se::MultiPlatformManager::PlatformWithId(platform_info.platform_id()) - .ValueOrDie(); - tf_allocator_adapter->emplace(ctx->device()->GetAllocator({}), platform); - return &tf_allocator_adapter->value(); - } - // platform_info. - tf_allocator_adapter->emplace(ctx->device()->GetAllocator({}), - ctx->op_device_context()->stream()); - return &tf_allocator_adapter->value(); -} - } // namespace XlaLocalLaunchBase::XlaLocalLaunchBase(OpKernelConstruction* ctx, @@ -214,70 +158,15 @@ XlaLocalLaunchBase::XlaLocalLaunchBase(OpKernelConstruction* ctx, constants_(constants), resources_(resources), function_(function), - platform_info_(PlatformInfoFromContext(ctx)), + platform_info_(XlaPlatformInfoFromContext(ctx)), has_ref_vars_(has_ref_vars) {} -static Status BuildCompilationCache(OpKernelContext* ctx, - const XlaPlatformInfo& platform_info, - XlaCompilationCache** cache) { - if (platform_info.xla_device_metadata()) { - *cache = new XlaCompilationCache( - platform_info.xla_device_metadata()->client(), - platform_info.xla_device_metadata()->jit_device_type()); - return Status::OK(); - } - - auto platform = - se::MultiPlatformManager::PlatformWithId(platform_info.platform_id()); - if (!platform.ok()) { - return platform.status(); - } - - xla::StatusOr compiler_for_platform = - xla::Compiler::GetForPlatform(platform.ValueOrDie()); - if (!compiler_for_platform.ok()) { - // In some rare cases (usually in unit tests with very small clusters) we - // may end up transforming an XLA cluster with at least one GPU operation - // (which would normally force the cluster to be compiled using XLA:GPU) - // into an XLA cluster with no GPU operations (i.e. containing only CPU - // operations). Such a cluster can fail compilation (in way that - // MarkForCompilation could not have detected) if the CPU JIT is not linked - // in. - // - // So bail out of _XlaCompile in this case, and let the executor handle the - // situation for us. 
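Orientation for this large hunk: PlatformInfoFromContext, GetAllocator, and the compilation-cache builder deleted here are not dropped; they move into the new xla_platform_info library added earlier in this change (see the jit/BUILD edits and the new xla_platform_info.h include above). Reconstructed from the call sites that remain in this file, the relocated entry points look roughly as follows; this is an editor's sketch, not the verbatim contents of xla_platform_info.h:

    // Gathers device/platform information for the op being constructed.
    XlaPlatformInfo XlaPlatformInfoFromContext(OpKernelConstruction* ctx);

    // Builds an XlaCompilationCache appropriate for the platform.
    Status BuildXlaCompilationCache(OpKernelContext* ctx,
                                    const XlaPlatformInfo& platform_info,
                                    XlaCompilationCache** cache);

    // Returns the platform's custom allocator if one is set, otherwise wraps
    // the device allocator in a TfAllocatorAdapter.
    se::DeviceMemoryAllocator* GetAllocator(
        absl::optional<se::TfAllocatorAdapter>* tf_allocator_adapter,
        OpKernelContext* ctx, const XlaPlatformInfo& platform_info);

    // Assembles XlaCompiler::Options from the cache, context, and platform info.
    XlaCompiler::Options GenerateCompilerOptions(
        XlaCompilationCache* cache, OpKernelContext* ctx,
        const XlaPlatformInfo& platform_info, bool has_ref_vars,
        absl::optional<se::TfAllocatorAdapter>* tf_allocator_adapter);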
- const Status& status = compiler_for_platform.status(); - if (status.code() == error::NOT_FOUND) { - return errors::Unimplemented("Could not find compiler for platform ", - platform.ValueOrDie()->Name(), ": ", - status.ToString()); - } - } - - xla::LocalClientOptions client_options; - client_options.set_platform(platform.ValueOrDie()); - client_options.set_intra_op_parallelism_threads( - ctx->device()->tensorflow_cpu_worker_threads()->num_threads); - auto client = xla::ClientLibrary::GetOrCreateLocalClient(client_options); - if (!client.ok()) { - return client.status(); - } - const XlaOpRegistry::DeviceRegistration* registration; - if (!XlaOpRegistry::GetCompilationDevice(platform_info.device_type().type(), - ®istration)) { - return errors::InvalidArgument("No JIT device registered for ", - platform_info.device_type().type()); - } - *cache = new XlaCompilationCache( - client.ValueOrDie(), DeviceType(registration->compilation_device_name)); - return Status::OK(); -} - static Status CompileToLocalExecutable( OpKernelContext* ctx, const NameAttrList& function, bool has_ref_vars, const XlaPlatformInfo& platform_info, absl::Span variable_infos, - absl::Span constants, bool lazy, xla::LocalClient** client, + absl::Span constants, bool lazy, bool may_alias_resource_update, + xla::LocalClient** client, const XlaCompiler::CompilationResult** compilation_result, xla::LocalExecutable** executable) { // We store information about the JIT-compiled XLA computation @@ -291,7 +180,7 @@ static Status CompileToLocalExecutable( TF_RETURN_IF_ERROR(rm->LookupOrCreate( rm->default_container(), "xla_cache", &cache, [&](XlaCompilationCache** cache) { - return BuildCompilationCache(ctx, platform_info, cache); + return BuildXlaCompilationCache(ctx, platform_info, cache); })); // Hold the reference to the JIT during evaluation. (We could probably // free it sooner because the ResourceMgr will retain a reference, but @@ -301,37 +190,22 @@ static Status CompileToLocalExecutable( *client = static_cast(cache->client()); absl::optional tf_allocator_adapter; - XlaCompiler::Options options; - options.client = *client; - if (ctx->op_device_context() != nullptr) { - options.device_ordinal = - ctx->op_device_context()->stream()->parent()->device_ordinal(); - } - options.device_type = cache->device_type(); - options.flib_def = ctx->function_library()->GetFunctionLibraryDefinition(); - options.graph_def_version = ctx->function_library()->graph_def_version(); - options.allow_cpu_custom_calls = - (platform_info.platform_id() == se::host::kHostPlatformId); - options.device_allocator = - GetAllocator(&tf_allocator_adapter, ctx, platform_info); - if (platform_info.xla_device_metadata()) { - options.shape_representation_fn = - platform_info.xla_device_metadata()->shape_representation_fn(); - } - // If reference variables are not present in the graph, we can safely alias - // passthrough parameters without performing a copy. - options.alias_passthrough_params = - !has_ref_vars && !platform_info.is_on_xla_device(); + XlaCompiler::Options options = GenerateCompilerOptions( + cache, ctx, platform_info, has_ref_vars, &tf_allocator_adapter); std::map constant_args; for (int i : constants) { constant_args.insert({i, ctx->input(i)}); } + XlaCompiler::CompileOptions compile_options; compile_options.is_entry_computation = true; // Optimization: where possible, have the computation return a naked array // rather than a one-element tuple. 
compile_options.always_return_tuple = false; + compile_options.alias_resource_update = !has_ref_vars && + !platform_info.is_on_xla_device() && + may_alias_resource_update; std::vector args; TF_RETURN_IF_ERROR(XlaComputationLaunchContext::BuildXlaCompilerArguments( @@ -350,20 +224,22 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) { const XlaCompiler::CompilationResult* compilation_result; xla::LocalExecutable* executable; - ResourceVarsSnapshot variables_snapshot; + std::vector variable_infos; { - std::vector variable_infos; OP_REQUIRES_OK( ctx, GetVariableInfosFromCtxInputs(ctx, resources_, &variable_infos)); OP_REQUIRES_OK(ctx, LockVariables(absl::MakeSpan(variable_infos))); Status s = CompileToLocalExecutable( ctx, function_, /*has_ref_vars=*/has_ref_vars_, platform_info_, - variable_infos, constants_, /*lazy=*/false, &client, - &compilation_result, &executable); + variable_infos, constants_, /*lazy=*/false, + /*may_alias_resource_update=*/true, &client, &compilation_result, + &executable); OP_REQUIRES_OK(ctx, s); - OP_REQUIRES_OK(ctx, - SnapshotResourceVariables(ctx, resources_, variable_infos, - &variables_snapshot)); + } + + std::map resource_var_ptrs; + for (int i = 0; i < resources_.size(); i++) { + resource_var_ptrs[resources_[i]] = variable_infos[i].var()->tensor(); } se::Stream* stream = @@ -374,12 +250,19 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) { absl::optional tf_allocator_adapter; se::DeviceMemoryAllocator* allocator = GetAllocator(&tf_allocator_adapter, ctx, platform_info_); + int device_ordinal = stream ? stream->parent()->device_ordinal() + : client->default_device_ordinal(); XlaComputationLaunchContext launch_context( - client, allocator, + client, allocator, device_ordinal, /*allocate_xla_tensors=*/platform_info_.is_on_xla_device(), platform_info_.UseMultipleStreams()); - launch_context.PopulateInputs(ctx, compilation_result, variables_snapshot, - /*missing_ctx_input_prefix=*/0); + const xla::HloInputOutputAliasConfig& input_output_alias = + executable->executable()->module().input_output_alias_config(); + xla::StatusOr> execution_inputs = + launch_context.PopulateInputs(ctx, compilation_result, resource_var_ptrs, + /*missing_ctx_input_prefix=*/0, + input_output_alias); + OP_REQUIRES_OK(ctx, execution_inputs.status()); // Execute the computation. 
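To orient the reader before the execution code continues below: the launch path now threads xla::ExecutionInput and xla::ExecutionOutput through the launch context instead of raw argument buffers, which lets input buffers be donated according to the executable's input/output alias config. Condensed, the new flow is roughly (an editor's paraphrase of the surrounding code; error handling and the async variant are elided):

    auto inputs = launch_context.PopulateInputs(ctx, compilation_result,
                                                resource_var_ptrs,
                                                /*missing_ctx_input_prefix=*/0,
                                                input_output_alias);
    auto output = executable->Run(std::move(*inputs), run_options);
    launch_context.PopulateOutputs(ctx, compilation_result,
                                   output->ConsumeResult(),
                                   /*missing_ctx_input_prefix=*/0,
                                   absl::MakeSpan(variable_infos),
                                   input_output_alias, resource_var_ptrs);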
VLOG(2) << "Executing computation."; @@ -403,24 +286,24 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) { Env* env = Env::Default(); auto start_time = env->NowMicros(); - xla::StatusOr run_result; + xla::StatusOr execution_output; if (!stream || platform_info_.platform_id() == se::host::kHostPlatformId) { - run_result = executable->Run(launch_context.arguments(), run_options); + execution_output = + executable->Run(std::move(*execution_inputs), run_options); } else { - run_result = executable->RunAsync(launch_context.arguments(), run_options); + execution_output = + executable->RunAsync(std::move(*execution_inputs), run_options); } - OP_REQUIRES(ctx, run_result.ok(), run_result.status()); + OP_REQUIRES(ctx, execution_output.ok(), execution_output.status()); auto elapsed = env->NowMicros() - start_time; VLOG(2) << "Elapsed time: " << elapsed << "us"; + OP_REQUIRES_OK( + ctx, launch_context.PopulateOutputs( + ctx, compilation_result, execution_output->ConsumeResult(), + /*missing_ctx_input_prefix=*/0, absl::MakeSpan(variable_infos), + input_output_alias, resource_var_ptrs)); - const xla::HloInputOutputAliasConfig& input_output_alias = - executable->executable()->module().input_output_alias_config(); - OP_REQUIRES_OK(ctx, - launch_context.PopulateOutputs( - ctx, compilation_result, run_result.ConsumeValueOrDie(), - /*missing_ctx_input_prefix=*/0, input_output_alias, - variables_snapshot)); VLOG(1) << "Done"; } @@ -490,7 +373,7 @@ XlaCompileOp::XlaCompileOp(OpKernelConstruction* ctx) constants_(ConstantsVector(ctx)), resources_(ResourcesVector(ctx)), function_(FunctionAttr(ctx)), - platform_info_(PlatformInfoFromContext(ctx)), + platform_info_(XlaPlatformInfoFromContext(ctx)), must_compile_(MustCompileAttr(ctx)), has_ref_vars_(HasRefVars(ctx)) {} @@ -516,10 +399,14 @@ void XlaCompileOp::Compute(OpKernelContext* ctx) { OP_REQUIRES_OK( ctx, GetVariableInfosFromCtxInputs(ctx, resources_, &variable_infos)); OP_REQUIRES_OK(ctx, LockVariables(absl::MakeSpan(variable_infos))); + + // Do not alias resource updates as locking variables in XlaCompile and + // unlocking them in XlaRun may lead to deadlocks. Status status = CompileToLocalExecutable( ctx, function_, has_ref_vars_, platform_info_, variable_infos, constants_, - /*lazy=*/!must_compile_, &client, &kernel, &executable); + /*lazy=*/!must_compile_, + /*may_alias_resource_update=*/false, &client, &kernel, &executable); OP_REQUIRES_OK(ctx, SnapshotResourceVariables(ctx, resources_, variable_infos, &variables)); if (must_compile_ || status.code() != error::UNIMPLEMENTED) { @@ -574,7 +461,7 @@ void XlaCompileOp::Compute(OpKernelContext* ctx) { } XlaRunOp::XlaRunOp(OpKernelConstruction* ctx) - : OpKernel(ctx), platform_info_(PlatformInfoFromContext(ctx)) {} + : OpKernel(ctx), platform_info_(XlaPlatformInfoFromContext(ctx)) {} void XlaRunOp::Compute(OpKernelContext* ctx) { VLOG(3) << "XlaRunOp " << def().name(); @@ -587,14 +474,22 @@ void XlaRunOp::Compute(OpKernelContext* ctx) { absl::optional tf_allocator_adapter; se::DeviceMemoryAllocator* allocator = GetAllocator(&tf_allocator_adapter, ctx, platform_info_); + se::Stream* stream = + ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr; + int device_ordinal = stream ? 
stream->parent()->device_ordinal() + : closure.client()->default_device_ordinal(); XlaComputationLaunchContext launch_context( - closure.client(), allocator, + closure.client(), allocator, device_ordinal, /*allocate_xla_tensors=*/platform_info_.is_on_xla_device(), /*use_multiple_streams=*/platform_info_.UseMultipleStreams()); // We're missing the must-be-constant inputs, tell `PopulateInputs` // about this. We don't actually need these inputs because they've // already been baked into the compiled kernel. + const xla::HloInputOutputAliasConfig& input_output_alias = + closure.executable()->executable()->module().input_output_alias_config(); + xla::StatusOr> execution_inputs; + std::map snapshot_ptrs; { tensorflow::profiler::TraceMe hlo_module_activity( [&] { @@ -604,13 +499,17 @@ void XlaRunOp::Compute(OpKernelContext* ctx) { }, tensorflow::profiler::TraceMeLevel::kInfo); - launch_context.PopulateInputs( - ctx, closure.compilation_result(), closure.resource_var_snapshots(), - /*missing_ctx_input_prefix=*/closure.num_constant_args()); + for (auto& p : closure.resource_var_snapshots()) { + snapshot_ptrs.emplace(p.first, + p.second.has_value() ? &p.second.value() : nullptr); + } + execution_inputs = launch_context.PopulateInputs( + ctx, closure.compilation_result(), snapshot_ptrs, + /*missing_ctx_input_prefix=*/closure.num_constant_args(), + input_output_alias); + OP_REQUIRES_OK(ctx, execution_inputs.status()); } - se::Stream* stream = - ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr; xla::ExecutableRunOptions run_options; run_options.set_stream(stream); run_options.set_allocator(allocator); @@ -631,21 +530,19 @@ void XlaRunOp::Compute(OpKernelContext* ctx) { Env* env = Env::Default(); auto start_time = env->NowMicros(); - xla::StatusOr run_result; + xla::StatusOr execution_output; if (!stream || platform_info_.platform_id() == se::host::kHostPlatformId) { - run_result = - closure.executable()->Run(launch_context.arguments(), run_options); + execution_output = + closure.executable()->Run(std::move(*execution_inputs), run_options); } else { - run_result = - closure.executable()->RunAsync(launch_context.arguments(), run_options); + execution_output = closure.executable()->RunAsync( + std::move(*execution_inputs), run_options); } - OP_REQUIRES(ctx, run_result.ok(), run_result.status()); + OP_REQUIRES(ctx, execution_output.ok(), execution_output.status()); auto elapsed = env->NowMicros() - start_time; VLOG(2) << "Elapsed time in computation: " << elapsed << "us"; - const xla::HloInputOutputAliasConfig& input_output_alias = - closure.executable()->executable()->module().input_output_alias_config(); tensorflow::profiler::TraceMe hlo_module_activity( [&] { @@ -653,12 +550,16 @@ void XlaRunOp::Compute(OpKernelContext* ctx) { }, tensorflow::profiler::TraceMeLevel::kInfo); + xla::StatusOr> variable_infos = GatherVariableInfo( + ctx, *closure.compilation_result(), closure.num_constant_args()); + OP_REQUIRES_OK(ctx, variable_infos.status()); + OP_REQUIRES_OK(ctx, LockVariables(absl::MakeSpan(*variable_infos))); OP_REQUIRES_OK( ctx, launch_context.PopulateOutputs( - ctx, closure.compilation_result(), run_result.ConsumeValueOrDie(), + ctx, closure.compilation_result(), execution_output->ConsumeResult(), /*missing_ctx_input_prefix=*/closure.num_constant_args(), - input_output_alias, closure.resource_var_snapshots())); + absl::MakeSpan(*variable_infos), input_output_alias, snapshot_ptrs)); } XlaMergeOp::XlaMergeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} diff --git 
a/tensorflow/compiler/jit/kernels/xla_ops.h b/tensorflow/compiler/jit/kernels/xla_ops.h index 112408226a8..78707c8126d 100644 --- a/tensorflow/compiler/jit/kernels/xla_ops.h +++ b/tensorflow/compiler/jit/kernels/xla_ops.h @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/compiler/jit/xla_compilation_cache.h" #include "tensorflow/compiler/jit/xla_device.h" #include "tensorflow/compiler/jit/xla_launch_util.h" +#include "tensorflow/compiler/jit/xla_platform_info.h" #include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" @@ -31,61 +32,6 @@ limitations under the License. namespace tensorflow { -// Holds some information about the platform on which an -// XlaLaunch/_XlaCompile/_XlaRun op must run on. -class XlaPlatformInfo { - public: - XlaPlatformInfo() : device_type_("") {} - XlaPlatformInfo(XlaPlatformInfo&&) = default; - explicit XlaPlatformInfo(const DeviceType device_type, - se::Platform::Id platform_id, - const XlaDevice::Metadata* xla_device_metadata, - se::DeviceMemoryAllocator* device_allocator) - : device_type_(device_type), - platform_id_(platform_id), - xla_device_metadata_(xla_device_metadata), - device_allocator_(device_allocator) {} - - XlaPlatformInfo& operator=(XlaPlatformInfo&& other) = default; - - bool UseMultipleStreams() const { - return xla_device_metadata_ && xla_device_metadata_->UseMultipleStreams(); - } - - // Non-null only when run on an XLA device. - se::DeviceMemoryAllocator* custom_allocator() const { - return device_allocator_; - } - - DeviceType device_type() const { return device_type_; } - - // This is equal to xla_device_metadata()->platform()->id() if - // xla_device_metadata() is not nullptr. - se::Platform::Id platform_id() const { return platform_id_; } - - // This may be null if the op this XlaPlatformInfo is for was not placed on an - // XLA device. - const XlaDevice::Metadata* xla_device_metadata() const { - return xla_device_metadata_; - } - bool is_on_xla_device() const { return xla_device_metadata() != nullptr; } - - private: - DeviceType device_type_; - se::Platform::Id platform_id_; - - // xla_device_metadata_ lives in the tensorflow::DeviceBase in which the - // XlaLaunch/_XlaCompile/_XlaRun op is placed and thus does not die before the - // XlaLaunch/_XlaCompile/_XlaRun OpKernel. - const XlaDevice::Metadata* xla_device_metadata_; - - // If the op associated with this XlaPlatformInfo is placed on an XLA device - // then device_allocator_ is the xla::Backend's memory allocator. If the op - // is placed on a regular CPU or GPU device then device_allocator_ is null. - se::DeviceMemoryAllocator* device_allocator_; - - TF_DISALLOW_COPY_AND_ASSIGN(XlaPlatformInfo); -}; // XlaLocalLaunchBase is almost the same as XlaLocalLaunchOp. 
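// The XlaPlatformInfo class removed here now lives in xla_platform_info.h
// (included above). A minimal sketch of how a kernel picks it up, modeled on
// the constructor changes elsewhere in this patch; `MyXlaKernel` is a
// hypothetical name used only for illustration:
#include "tensorflow/compiler/jit/xla_platform_info.h"

class MyXlaKernel : public OpKernel {
 public:
  explicit MyXlaKernel(OpKernelConstruction* ctx)
      : OpKernel(ctx), platform_info_(XlaPlatformInfoFromContext(ctx)) {}

  void Compute(OpKernelContext* ctx) override {}

 private:
  // Captured once at construction; later used to decide whether the op runs
  // on an XLA device and whether it uses multiple streams.
  const XlaPlatformInfo platform_info_;
};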
// The only difference is that it does not require arguments to follow diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc index 55ff57a04c5..19eb61b6f72 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc @@ -1952,6 +1952,7 @@ absl::flat_hash_set GetKnownXLAAllowlistOp() { "ParallelDynamicStitch", "ParameterizedTruncatedNormal", "PartitionedCall", + "PopulationCount", "Qr", "QuantizeAndDequantizeV2", "QuantizeAndDequantizeV3", @@ -2014,6 +2015,7 @@ absl::flat_hash_set GetKnownXLAAllowlistOp() { "StatefulUniform", "StatefulUniformFullInt", "StatefulUniformInt", + "StatelessCase", "StatelessIf", "StatelessMultinomial", "StatelessRandomNormal", diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc index 3ae72eb514c..e88319bb732 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc @@ -1829,7 +1829,7 @@ TEST(XlaCompilationTest, XLALiteAllowlist) { } EXPECT_TRUE(unknow_op.empty()) << "Someone added support for a new TF opeations inside XLA. They must " - "be included in the XLALite allowlist or blacklist:\n" + "be included in the XLALite allowlist or denylist:\n" << absl::StrJoin(unknow_op, "\n"); } } // namespace diff --git a/tensorflow/compiler/jit/shape_inference.cc b/tensorflow/compiler/jit/shape_inference.cc index 72804ff57e4..7f585e70ec4 100644 --- a/tensorflow/compiler/jit/shape_inference.cc +++ b/tensorflow/compiler/jit/shape_inference.cc @@ -36,7 +36,7 @@ Status ShapeHandleToTensorShape(shape_inference::InferenceContext* context, if (!context->RankKnown(handle)) return Status::OK(); std::vector dims(context->Rank(handle)); - for (int32 i = 0; i < dims.size(); ++i) { + for (int32 i = 0, end = dims.size(); i < end; ++i) { dims[i] = context->Value(context->Dim(handle, i)); } return PartialTensorShape::MakePartialShape(dims.data(), dims.size(), shape); diff --git a/tensorflow/compiler/jit/xla_cluster_util.cc b/tensorflow/compiler/jit/xla_cluster_util.cc index b8b11d2c7cd..38c23b7fa25 100644 --- a/tensorflow/compiler/jit/xla_cluster_util.cc +++ b/tensorflow/compiler/jit/xla_cluster_util.cc @@ -489,7 +489,7 @@ Status GetNodesRelatedToRefVariablesInDirection( /*stable_comparator=*/NodeComparatorName()); } - int old_result_size; + size_t old_result_size; int iterations = 0; const int kMaxIterations = 10 * 1000; diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc index 62b0c0ab4cf..b1525337dbc 100644 --- a/tensorflow/compiler/jit/xla_compilation_cache.cc +++ b/tensorflow/compiler/jit/xla_compilation_cache.cc @@ -97,7 +97,7 @@ bool XlaCompilationCache::Signature::operator==(const Signature& other) const { if (arg_shapes != other.arg_shapes) return false; if (arg_values.size() != other.arg_values.size()) return false; - for (int i = 0; i < arg_values.size(); ++i) { + for (int i = 0, end = arg_values.size(); i < end; ++i) { if (arg_values[i].dtype() != other.arg_values[i].dtype() || arg_values[i].shape() != other.arg_values[i].shape() || arg_values[i].tensor_data() != other.arg_values[i].tensor_data()) { @@ -158,7 +158,7 @@ Status XlaCompilationCache::BuildExecutable( std::vector argument_layouts( result.xla_input_shapes.size()); - for (int i = 0; i < result.xla_input_shapes.size(); ++i) { + for (int i = 0, end = 
result.xla_input_shapes.size(); i < end; ++i) { argument_layouts[i] = &result.xla_input_shapes[i]; } xla::ExecutableBuildOptions build_options; @@ -224,7 +224,7 @@ static xla::StatusOr> CreateGraph( // Create dummy _Arg nodes. Link these to `node` and also via a control // dependency edge to the _SOURCE node. - for (int64 i = 0; i < args.size(); ++i) { + for (int64 i = 0, end = args.size(); i < end; ++i) { Node* node; string arg_name = absl::StrCat("_arg", i); Status status = @@ -240,7 +240,7 @@ static xla::StatusOr> CreateGraph( } // Similarly with return values, create dummy _Retval nodes fed by `node`. - for (int64 i = 0; i < result_types.size(); ++i) { + for (int64 i = 0, end = result_types.size(); i < end; ++i) { Node* node; string retval_name = absl::StrCat("_retval", i); Status status = NodeBuilder(retval_name, FunctionLibraryDefinition::kRetOp) @@ -271,7 +271,7 @@ Status XlaCompilationCache::CompileSingleOp( auto compile_op = [&](XlaCompiler* compiler, XlaCompiler::CompilationResult* result) { std::vector result_dtypes(ctx->num_outputs()); - for (int i = 0; i < result_dtypes.size(); ++i) { + for (int i = 0, end = result_dtypes.size(); i < end; ++i) { result_dtypes[i] = ctx->expected_output_dtype(i); } @@ -330,7 +330,7 @@ Status XlaCompilationCache::CompileImpl( if (VLOG_IS_ON(2)) { VLOG(2) << "num_inputs=" << args.size(); - for (int i = 0; i < args.size(); i++) { + for (int i = 0, end = args.size(); i < end; i++) { VLOG(3) << i << ": " << args[i].HumanString(); } } diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc index afaee614f02..73c512bfa6f 100644 --- a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc +++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc @@ -20,6 +20,7 @@ limitations under the License. #include "absl/memory/memory.h" #include "tensorflow/compiler/jit/xla_device.h" #include "tensorflow/compiler/jit/xla_launch_util.h" +#include "tensorflow/compiler/jit/xla_platform_info.h" #include "tensorflow/compiler/tf2xla/const_analysis.h" #include "tensorflow/compiler/tf2xla/tf2xla_util.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" @@ -41,94 +42,82 @@ static std::vector GetResourceVariableIndices(OpKernelContext* ctx) { } Status XlaCompileOnDemandOp::Run(OpKernelContext* ctx, - const XlaDevice::Metadata& metadata, + XlaCompilationCache* cache, const XlaCompiler::CompilationResult* result, xla::LocalExecutable* executable, const ResourceVarsSnapshot& variable_args) { - xla::LocalClient* client = metadata.client(); + xla::LocalClient* client = static_cast(cache->client()); - // Builds an XLA allocator for the device. XlaComputationLaunchContext launch_context( client, client->backend().memory_allocator(), - /*allocate_xla_tensors=*/true, - /*use_multiple_streams=*/metadata.UseMultipleStreams()); + client->default_device_ordinal(), + /*allocate_xla_tensors=*/platform_info_.xla_device_metadata() != nullptr, + platform_info_.xla_device_metadata() + ? platform_info_.xla_device_metadata()->UseMultipleStreams() + : false); - launch_context.PopulateInputs(ctx, result, variable_args, - /*missing_ctx_input_prefix=*/0); + std::map snapshot_ptrs; + for (auto& p : variable_args) { + snapshot_ptrs.emplace(p.first, + p.second.has_value() ? 
&p.second.value() : nullptr); + } + + const xla::HloInputOutputAliasConfig& input_output_alias = + executable->executable()->module().input_output_alias_config(); + xla::StatusOr> execution_inputs = + launch_context.PopulateInputs(ctx, result, snapshot_ptrs, + /*missing_ctx_input_prefix=*/0, + input_output_alias); + TF_RETURN_IF_ERROR(execution_inputs.status()); se::Stream* stream = ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr; - TF_RET_CHECK(stream); VLOG(2) << "Executing computation: " << name(); - for (const xla::ShapedBuffer* arg : launch_context.arguments()) { - VLOG(2) << name() << ": " << *arg; - } xla::ExecutableRunOptions run_options; run_options.set_stream(stream); run_options.set_allocator(client->backend().memory_allocator()); run_options.set_intra_op_thread_pool(&ctx->eigen_cpu_device()); run_options.set_rng_seed(GetXLARandomSeed()); - xla::StatusOr run_result = - executable->Run(launch_context.arguments(), run_options); + xla::StatusOr run_result = + executable->Run(execution_inputs.ConsumeValueOrDie(), run_options); TF_RETURN_IF_ERROR(run_result.status()); - - const xla::HloInputOutputAliasConfig& input_output_alias = - executable->executable()->module().input_output_alias_config(); + xla::ExecutionOutput execution_output = run_result.ConsumeValueOrDie(); + xla::StatusOr> variable_infos = + GatherVariableInfo(ctx, *result, 0); + TF_RETURN_IF_ERROR(variable_infos.status()); + TF_RETURN_IF_ERROR(LockVariables(absl::MakeSpan(*variable_infos))); TF_RETURN_IF_ERROR(launch_context.PopulateOutputs( - ctx, result, run_result.ConsumeValueOrDie(), - /*missing_ctx_input_prefix=*/0, input_output_alias, variable_args)); + ctx, result, execution_output.ConsumeResult(), + /*missing_ctx_input_prefix=*/0, absl::MakeSpan(*variable_infos), + input_output_alias, snapshot_ptrs)); return Status::OK(); } -Status XlaCompileOnDemandOp::MustArgumentBeConstant( - const OpKernel* op_kernel, int64 argument_idx, - FunctionLibraryRuntime* flib_runtime, bool* result) { - *result = false; - - // TODO(jmolloy): This could be expensive, so memoize. - std::vector constant_input_indices; - TF_RETURN_IF_ERROR(GetCompileTimeConstInputs( - op_kernel, &constant_input_indices, flib_runtime)); - *result = absl::c_binary_search(constant_input_indices, argument_idx); - return Status::OK(); -} - -// TODO(ycao): Remove the need to call ShouldArgumentBeConstant. Its benefit is -// not clear yet and it causes heavy constant analysis to run twice. 
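// The snapshot-to-pointer conversion above also appears in XlaRunOp::Compute;
// a small helper sketch of the same logic, assuming ResourceVarsSnapshot maps
// an input index to absl::optional<Tensor>. The helper name is illustrative,
// and a null pointer stands for a variable that was uninitialized at snapshot
// time:
static std::map<int, const Tensor*> SnapshotPointers(
    const ResourceVarsSnapshot& snapshots) {
  std::map<int, const Tensor*> ptrs;
  for (const auto& p : snapshots) {
    ptrs.emplace(p.first, p.second.has_value() ? &p.second.value() : nullptr);
  }
  return ptrs;
}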
-Status XlaCompileOnDemandOp::ShouldArgumentBeConstant( - const OpKernel* op_kernel, int64 argument_idx, - FunctionLibraryRuntime* flib_runtime, bool* result) { - return MustArgumentBeConstant(op_kernel, argument_idx, flib_runtime, result); -} - Status XlaCompileOnDemandOp::Compile( - OpKernelContext* ctx, const XlaDevice::Metadata& metadata, - const XlaCompiler::CompilationResult** result, - ResourceVarsSnapshot* variable_args, xla::LocalExecutable** executable) { + OpKernelContext* ctx, const XlaCompiler::CompilationResult** result, + XlaCompilationCache** cache, ResourceVarsSnapshot* variable_args, + xla::LocalExecutable** executable) { std::map constant_arguments; + + std::vector constant_input_indices; + TF_RETURN_IF_ERROR(GetCompileTimeConstInputs( + &ctx->op_kernel(), &constant_input_indices, ctx->function_library())); + CHECK(absl::c_is_sorted(constant_input_indices)); + for (int64 i = 0; i < ctx->num_inputs(); ++i) { const Tensor& device_tensor = ctx->input(i); if (const XlaTensor* xla_tensor = XlaTensor::FromTensor(&device_tensor)) { if (xla_tensor->has_host_tensor()) { - bool should_arg_be_const; - TF_RETURN_IF_ERROR(ShouldArgumentBeConstant(&ctx->op_kernel(), i, - ctx->function_library(), - &should_arg_be_const)); - if (should_arg_be_const) { + if (absl::c_binary_search(constant_input_indices, i)) { constant_arguments[i] = xla_tensor->host_tensor(); } } } - if (constant_arguments.count(i) == 0) { - bool must_argument_be_const; - TF_RETURN_IF_ERROR(MustArgumentBeConstant(&ctx->op_kernel(), i, - ctx->function_library(), - &must_argument_be_const)); - - if (must_argument_be_const) { + if (!constant_arguments.count(i)) { + if (absl::c_binary_search(constant_input_indices, i)) { // Slow path; the argument is not available as a host constant so we // must fetch it synchronously. Tensor host_tensor; @@ -156,24 +145,16 @@ Status XlaCompileOnDemandOp::Compile( ResourceMgr* rm = ctx->resource_manager(); CHECK(rm); - XlaCompilationCache* cache; TF_RETURN_IF_ERROR(rm->LookupOrCreate( - rm->default_container(), "xla_cache", &cache, - [&](XlaCompilationCache** cache) { - *cache = new XlaCompilationCache(metadata.client(), - metadata.jit_device_type()); - return Status::OK(); + rm->default_container(), "xla_cache", cache, + [&](XlaCompilationCache** write_into_cache) { + return BuildXlaCompilationCache(ctx, platform_info_, write_into_cache); })); - // Hold the reference to the JIT during evaluation. (We could probably - // free it sooner because the ResourceMgr will retain a reference, but - // this is more obviously correct.) 
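// With the two helpers above removed, constant-argument detection now happens
// once per Compile call: GetCompileTimeConstInputs fills a sorted index list
// and each argument is tested with absl::c_binary_search. A condensed sketch
// of that flow (the probed index is hypothetical):
std::vector<int> constant_input_indices;
TF_RETURN_IF_ERROR(GetCompileTimeConstInputs(
    &ctx->op_kernel(), &constant_input_indices, ctx->function_library()));
CHECK(absl::c_is_sorted(constant_input_indices));  // required for binary search
const bool arg1_is_constant =
    absl::c_binary_search(constant_input_indices, 1);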
- core::ScopedUnref cache_ref(cache); - XlaCompiler::Options options; - options.device_type = metadata.jit_device_type(); - options.client = metadata.client(); - options.flib_def = ctx->function_library()->GetFunctionLibraryDefinition(); - options.shape_representation_fn = metadata.shape_representation_fn(); + absl::optional tf_allocator_adapter; + XlaCompiler::Options options = + GenerateCompilerOptions(*cache, ctx, platform_info_, + /*has_ref_vars=*/true, &tf_allocator_adapter); XlaCompiler::CompileOptions compile_options; compile_options.is_entry_computation = true; @@ -194,19 +175,23 @@ Status XlaCompileOnDemandOp::Compile( constant_arguments, variable_infos, ctx, &args)); } - return cache->CompileSingleOp(options, args, ctx, compile_options, result, - executable); + return (*cache)->CompileSingleOp(options, args, ctx, compile_options, result, + executable); } void XlaCompileOnDemandOp::Compute(OpKernelContext* ctx) { const XlaCompiler::CompilationResult* result; xla::LocalExecutable* executable; - const XlaDevice::Metadata* metadata; - OP_REQUIRES_OK(ctx, XlaDevice::GetMetadata(ctx, &metadata)); ResourceVarsSnapshot variable_args; + XlaCompilationCache* cache; OP_REQUIRES_OK(ctx, - Compile(ctx, *metadata, &result, &variable_args, &executable)); - OP_REQUIRES_OK(ctx, Run(ctx, *metadata, result, executable, variable_args)); + Compile(ctx, &result, &cache, &variable_args, &executable)); + + // Hold the reference to the JIT during evaluation. (We could probably + // free it sooner because the ResourceMgr will retain a reference, but + // this is more obviously correct.) + core::ScopedUnref cache_ref(cache); + OP_REQUIRES_OK(ctx, Run(ctx, cache, result, executable, variable_args)); } } // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.h b/tensorflow/compiler/jit/xla_compile_on_demand_op.h index cc5f2f1e42f..095d3427d41 100644 --- a/tensorflow/compiler/jit/xla_compile_on_demand_op.h +++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.h @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/compiler/jit/xla_device.h" #include "tensorflow/compiler/jit/xla_launch_util.h" +#include "tensorflow/compiler/jit/xla_platform_info.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/core/framework/function.h" @@ -35,25 +36,24 @@ namespace tensorflow { // vanilla TensorFlow op as long as the bridge supports it. 
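// Cache ownership after this change, condensed into one sketch: Compile()
// looks the cache up (or creates it) in the ResourceMgr and hands the raw
// pointer back to Compute(), which pins it with core::ScopedUnref for the
// duration of the Run() call. The explicit template argument below is an
// assumption about ResourceMgr::LookupOrCreate's signature:
XlaCompilationCache* cache = nullptr;
TF_RETURN_IF_ERROR(rm->LookupOrCreate<XlaCompilationCache>(
    rm->default_container(), "xla_cache", &cache,
    [&](XlaCompilationCache** write_into_cache) {
      return BuildXlaCompilationCache(ctx, platform_info_, write_into_cache);
    }));
// ...later, in the caller:
core::ScopedUnref cache_ref(cache);  // unrefs the cache when it goes out of scope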
class XlaCompileOnDemandOp : public OpKernel { public: - explicit XlaCompileOnDemandOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + explicit XlaCompileOnDemandOp(OpKernelConstruction* ctx) + : OpKernel(ctx), platform_info_(XlaPlatformInfoFromContext(ctx)) {} void Compute(OpKernelContext* ctx) override; private: XlaCompiler::Argument CreateCompilerArgument(OpKernelContext* ctx, int64 i); - Status ShouldArgumentBeConstant(const OpKernel* op_kernel, int64 argument_idx, - FunctionLibraryRuntime* flib_runtime, - bool* result); - Status MustArgumentBeConstant(const OpKernel* op_kernel, int64 argument_idx, - FunctionLibraryRuntime* flib_runtime, - bool* result); - Status Compile(OpKernelContext* ctx, const XlaDevice::Metadata& metadata, + Status Compile(OpKernelContext* ctx, const XlaCompiler::CompilationResult** result, + XlaCompilationCache** cache, ResourceVarsSnapshot* variable_args, xla::LocalExecutable** executable); - Status Run(OpKernelContext* ctx, const XlaDevice::Metadata& metadata, + + Status Run(OpKernelContext* ctx, XlaCompilationCache* cache, const XlaCompiler::CompilationResult* result, xla::LocalExecutable* executable, const ResourceVarsSnapshot& variable_args); + + const XlaPlatformInfo platform_info_; }; } // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_device_ops.cc b/tensorflow/compiler/jit/xla_device_ops.cc index 8126059262b..f0555ae32e5 100644 --- a/tensorflow/compiler/jit/xla_device_ops.cc +++ b/tensorflow/compiler/jit/xla_device_ops.cc @@ -59,11 +59,13 @@ void XlaAssignVariableOp::Compute(OpKernelContext* context) { return Status::OK(); })); mutex_lock ml(*variable->mu()); - OP_REQUIRES(context, variable->tensor()->dtype() == dtype_, - errors::InvalidArgument( - "Trying to assign variable with wrong dtype. Expected ", - DataTypeString(variable->tensor()->dtype()), " got ", - DataTypeString(dtype_))); + OP_REQUIRES( + context, + !variable->is_initialized || variable->tensor()->dtype() == dtype_, + errors::InvalidArgument( + "Trying to assign variable with wrong dtype. Expected ", + DataTypeString(variable->tensor()->dtype()), " got ", + DataTypeString(dtype_))); variable->is_initialized = true; *variable->tensor() = value; } diff --git a/tensorflow/compiler/jit/xla_interpreter_device.cc b/tensorflow/compiler/jit/xla_interpreter_device.cc deleted file mode 100644 index f720183e196..00000000000 --- a/tensorflow/compiler/jit/xla_interpreter_device.cc +++ /dev/null @@ -1,106 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// Registers the XLA_INTERPRETER device which exposes the XLA Interpreter. 
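// Context for the relaxed XlaAssignVariableOp check above: variables can now
// be created eagerly as uninitialized placeholders, e.g. (sketch, mirroring
// the GetVariableInfosFromCtxInputs change later in this patch):
Var* variable = new Var(DT_INVALID);  // uninitialized placeholder
// variable->is_initialized is false, so the dtype comparison only applies
// once the first real write has fixed the variable's dtype.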
- -#include "absl/memory/memory.h" -#include "tensorflow/compiler/jit/kernels/xla_ops.h" -#include "tensorflow/compiler/jit/xla_device.h" -#include "tensorflow/compiler/jit/xla_device_ops.h" -#include "tensorflow/compiler/tf2xla/xla_op_registry.h" - -namespace tensorflow { - -const char* const DEVICE_XLA_INTERPRETER = "XLA_INTERPRETER"; -const char* const DEVICE_INTERPRETER_XLA_JIT = "XLA_INTERPRETER_JIT"; - -constexpr std::array kExecAllTypes = { - {DT_INT8, DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, - DT_COMPLEX128, DT_BOOL, DT_BFLOAT16}}; - -class XlaInterpreterDeviceFactory : public DeviceFactory { - public: - Status ListPhysicalDevices(std::vector* devices) override; - Status CreateDevices(const SessionOptions& options, const string& name_prefix, - std::vector>* devices) override; -}; - -Status XlaInterpreterDeviceFactory::ListPhysicalDevices( - std::vector* devices) { - devices->push_back( - absl::StrCat("/physical_device:", DEVICE_XLA_INTERPRETER, ":0")); - - return Status::OK(); -} - -Status XlaInterpreterDeviceFactory::CreateDevices( - const SessionOptions& session_options, const string& name_prefix, - std::vector>* devices) { - static XlaDeviceOpRegistrations* registrations = RegisterXlaDeviceKernels( - DEVICE_XLA_INTERPRETER, DEVICE_INTERPRETER_XLA_JIT); - (void)registrations; - - XlaOpRegistry::DeviceRegistration registration; - registration.compilation_device_name = DEVICE_INTERPRETER_XLA_JIT; - registration.autoclustering_policy = - XlaOpRegistry::AutoclusteringPolicy::kAlways; - registration.cluster_resource_variable_ops_unsafely = true; - registration.cluster_stack_ops = false; - registration.cluster_tensor_array_ops = true; - registration.cluster_stateful_rng_ops = true; - registration.cluster_control_trigger = true; - registration.elide_assert_and_checknumerics = true; - registration.cluster_variant_ops = true; - registration.cluster_slow_ops = true; - registration.cluster_inaccurate_ops = true; - XlaOpRegistry::RegisterCompilationDevice(DEVICE_XLA_INTERPRETER, - registration); - - TF_ASSIGN_OR_RETURN( - auto platform, se::MultiPlatformManager::PlatformWithName("Interpreter")); - - XlaDevice::Options options; - options.platform = platform; - options.device_name_prefix = name_prefix; - options.device_name = DEVICE_XLA_INTERPRETER; - options.device_ordinal = 0; - options.compilation_device_name = DEVICE_INTERPRETER_XLA_JIT; - options.use_multiple_streams = false; - devices->push_back(absl::make_unique(session_options, options)); - - return Status::OK(); -} - -// Set priority to be below the default priority (50), so that Interpreter is -// not selected as a high priority device over other default devices. See -// constructor comments for Registrar in -// tensorflow/core/common_runtime/device_factory.h for a list of priority for -// devices. 
-REGISTER_LOCAL_DEVICE_FACTORY(DEVICE_XLA_INTERPRETER, - XlaInterpreterDeviceFactory, 40); - -// Kernel registrations -static bool OpFilter(KernelDef* kdef) { return true; } - -REGISTER_XLA_LAUNCH_KERNEL(DEVICE_XLA_INTERPRETER, XlaLocalLaunchOp, - kExecAllTypes); -REGISTER_XLA_COMPILE_KERNEL(DEVICE_XLA_INTERPRETER, XlaCompileOp, - kExecAllTypes); -REGISTER_XLA_RUN_KERNEL(DEVICE_XLA_INTERPRETER, XlaRunOp, kExecAllTypes); - -REGISTER_XLA_DEVICE_KERNELS(DEVICE_XLA_INTERPRETER, kExecAllTypes); -REGISTER_XLA_BACKEND(DEVICE_INTERPRETER_XLA_JIT, kExecAllTypes, OpFilter); - -} // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_kernel_creator.cc b/tensorflow/compiler/jit/xla_kernel_creator.cc index 5ca146969e0..3a6345afe9f 100644 --- a/tensorflow/compiler/jit/xla_kernel_creator.cc +++ b/tensorflow/compiler/jit/xla_kernel_creator.cc @@ -14,10 +14,62 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/jit/xla_kernel_creator.h" +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/jit/compilability_check_util.h" +#include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/jit/flags.h" -#include "tensorflow/compiler/jit/xla_kernel_creator_util.h" +#include "tensorflow/compiler/jit/kernels/xla_ops.h" +#include "tensorflow/compiler/jit/mark_for_compilation_pass.h" +#include "tensorflow/compiler/tf2xla/const_analysis.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/util/ptr_util.h" + +namespace { + +// Utility which searches for values in a sorted list by scanning over it once. +// No matter how many times ScanForValue is called, the list is scanned at most +// once. However, if a call to ScanForValue skips over a value, that value is +// not revisited in future calls to ScanForValue, so callers must take +// care to order their calls. +// +// Useful for merging multiple sorted lists in O(n) time. +class SinglePassSearch { + public: + // Creates a SinglePassSearch object that can be used to search in `values`. + // Does not take ownership of `values`. `values` must outlive this. + // `values` must be sorted. + explicit SinglePassSearch(const std::vector* values) + : current_index_(0), values_(values) {} + + // Scans forward in the vector looking for "value", updating the internal + // position in to the vector. + // Returns true iff the vector contains the given value at or after current + // position. + // Not thread-safe. 
+ bool ScanForValue(int value) { + while (current_index_ < values_->size() && + (*values_)[current_index_] <= value) { + if ((*values_)[current_index_] == value) { + current_index_++; + return true; + } + current_index_++; + } + return false; + } + + private: + int current_index_; + const std::vector* values_; +}; + +} // end namespace namespace tensorflow { @@ -27,6 +79,121 @@ bool XlaKernelCreator::CanCreateKernel( return CanCreateXlaKernel(props->node_def); } +static Status CreateXlaKernel(FunctionLibraryRuntime* flr, + const NodeDef& node_def, + std::unique_ptr* kernel) { + if (!CanCreateXlaKernel(node_def)) { + return errors::Internal("Invalid node: ", node_def.ShortDebugString()); + } + + VLOG(3) << "Attempting to create XlaLaunchOp for " << node_def.DebugString(); + + // Make sure that kernels have been registered on the JIT device. + XlaOpRegistry::RegisterCompilationKernels(); + + // Only check for compilability if the MLIR bridge is not enabled. + if (!GetMlirCommonFlags()->tf_mlir_enable_mlir_bridge) { + RecursiveCompilabilityChecker::UncompilableNodesMap uncompilable_nodes_map; + if (!IsCompilable(flr, node_def, &uncompilable_nodes_map)) { + std::vector + uncompilable_node_info; + for (const auto& it : uncompilable_nodes_map) { + for (const auto& info : it.second.second) { + uncompilable_node_info.emplace_back(info); + } + } + string message = absl::StrCat( + "Function invoked by the following node is not compilable: ", + SummarizeNodeDef(node_def, /*max_inputs_in_summary=*/10), ".\n"); + absl::StrAppend(&message, "Uncompilable nodes:"); + for (const auto& node_info : uncompilable_node_info) { + string node_message = absl::StrCat("\n", node_info.name, ": ", + node_info.uncompilable_reason, "\n", + "\tStacktrace:\n"); + for (const auto& stack_frame : node_info.stack_trace) { + absl::StrAppendFormat(&node_message, "\t\tNode: %s, function: %s\n", + stack_frame.name, stack_frame.function_name); + } + absl::StrAppend(&message, node_message); + } + VLOG(1) << message; + return errors::InvalidArgument(message); + } + } + + // Get function body, constant args, and resource args. + const FunctionBody* fbody = nullptr; + std::vector constant_arg_indices; + std::vector resource_arg_indices; + TF_RETURN_IF_ERROR(GetBodyAndConstantsAndResources( + flr, node_def, &fbody, &constant_arg_indices, &resource_arg_indices)); + + // Set input and output memory types. + MemoryTypeVector input_memory_types(fbody->arg_types.size(), DEVICE_MEMORY); + // These indices are used only for optimization purposes. They allow us + // to loop over constant_arg_indices and resource_arg_indices only once + // while iterating over all the function arguments checking if it is a + // resource or a constant. + // The reason we optimized this code is because functions can have a lot of + // captured arguments. For example, the backward pass of ResNet50 takes in all + // 214 variables and a similar number of activations. + SinglePassSearch constants_search(&constant_arg_indices); + SinglePassSearch resources_search(&resource_arg_indices); + for (size_t i = 0; i < fbody->arg_types.size(); ++i) { + if (resources_search.ScanForValue(i) || constants_search.ScanForValue(i)) { + // Compile-time constants and resource handles are expected to be in + // host memory. + input_memory_types[i] = HOST_MEMORY; + } + } + // One might wonder, about the case where a compile-time constant argument + // (which must be in host memory) is also used as an input into an op, + // e.g. Add, that expects its inputs in device memory. 
Here is how it + // works now. + // First, what do we mean by "op expects an input in XYZ memory"? + // There are two types of "ops" here: the tf2xla kernel and the HLO + // computation it builds. The tf2xla kernel needs to retrieve the actual + // numeric value of the compile-time constant tensors, so it really expects + // them to be on in host memory. However, for other inputs, it refers to them + // using xla::ComputationDataHandle, which is just a symbolic handle that + // xla::ComputationBuilder assigns. How does this handle gets assigned for + // constant arguments? Even constant arguments get an _Arg node in the graph + // instantiated for Function compilation. The tf2xla kernel for constant _Arg + // nodes takes the constant value, converts it to XlaLiteral, and feeds it + // to xla::ComputationBuilder.ConstantLiteral, which returns the handle. This + // constant XlaLiteral is included in the HLO graph, and subsequently, in + // the actual executable, which is copied to the device before being + // executed. Thus, when this executable runs, the constant is available in + // device memory. + + // XlaLaunch kernel keeps all outputs (including constants, which it copies), + // in device memory except for resources. + MemoryTypeVector output_memory_types(fbody->ret_types.size(), DEVICE_MEMORY); + for (size_t i = 0; i < fbody->ret_types.size(); ++i) { + if (fbody->ret_types[i] == DT_RESOURCE) { + output_memory_types[i] = HOST_MEMORY; + } + } + + // Create the kernel. + NameAttrList function; + TF_RETURN_IF_ERROR(NameAndAttrsFromFunctionCall(node_def, &function)); + Device* dev = flr->device(); + Status s; + auto props = std::make_shared( + &fbody->fdef.signature(), node_def, fbody->arg_types, fbody->ret_types); + OpKernelConstruction construction(DeviceType(dev->device_type()), dev, + dev->GetAllocator(AllocatorAttributes()), + flr, dev->resource_manager(), props, + input_memory_types, output_memory_types, + flr->graph_def_version(), &s); + + *kernel = absl::make_unique( + &construction, constant_arg_indices, resource_arg_indices, function, + /*has_ref_vars=*/false); + return s; +} + Status XlaKernelCreator::CreateKernel( FunctionLibraryRuntime* flr, const std::shared_ptr& props, @@ -34,19 +201,12 @@ Status XlaKernelCreator::CreateKernel( return CreateXlaKernel(flr, props->node_def, kernel); } -namespace { - -bool RegisterLaunchOpCreator() { +static bool RegisterLaunchOpCreator() { XlaKernelCreator* xla_kernel_creator = new XlaKernelCreator(); RegisterDefaultCustomKernelCreator(xla_kernel_creator); return true; } static bool register_me = RegisterLaunchOpCreator(); -static bool register_xla = [] { - SetXlaIsEnabled(); - return true; -}(); -} // end namespace } // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_kernel_creator_util.cc b/tensorflow/compiler/jit/xla_kernel_creator_util.cc deleted file mode 100644 index 3cc68f2a1a4..00000000000 --- a/tensorflow/compiler/jit/xla_kernel_creator_util.cc +++ /dev/null @@ -1,182 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
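// A small usage sketch for the SinglePassSearch helper added above, with
// hypothetical index lists. Both lists must be sorted and the probes must be
// issued in increasing order, because the scan never moves backwards:
std::vector<int> constant_arg_indices = {0, 3};
std::vector<int> resource_arg_indices = {2, 3, 5};
SinglePassSearch constants_search(&constant_arg_indices);
SinglePassSearch resources_search(&resource_arg_indices);
for (int i = 0; i < 6; ++i) {
  bool host_memory =
      resources_search.ScanForValue(i) || constants_search.ScanForValue(i);
  // host_memory is true for i == 0, 2, 3, 5 and false for i == 1, 4.
}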
-See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#include "tensorflow/compiler/jit/xla_kernel_creator_util.h" - -#include "absl/memory/memory.h" -#include "absl/strings/str_cat.h" -#include "absl/strings/str_format.h" -#include "tensorflow/compiler/jit/compilability_check_util.h" -#include "tensorflow/compiler/jit/defs.h" -#include "tensorflow/compiler/jit/kernels/xla_ops.h" -#include "tensorflow/compiler/jit/mark_for_compilation_pass.h" -#include "tensorflow/compiler/tf2xla/const_analysis.h" -#include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/core/common_runtime/function.h" -#include "tensorflow/core/framework/node_def_builder.h" -#include "tensorflow/core/framework/node_def_util.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/util/ptr_util.h" - -namespace tensorflow { -namespace { - -// Utility which searches for values in a sorted list by scanning over it once. -// No matter how many times ScanForValue is called, the list is scanned at most -// once. However, if a call to ScanForValue skips over a value, that value is -// not revisited in future calls to ScanForValue, so callers must take -// care to order their calls. -// -// Useful for merging multiple sorted lists in O(n) time. -class SinglePassSearch { - public: - // Creates a SinglePassSearch object that can be used to search in `values`. - // Does not take ownership of `values`. `values` must outlive this. - // `values` must be sorted. - explicit SinglePassSearch(const std::vector* values) - : current_index_(0), values_(values) {} - - // Scans forward in the vector looking for "value", updating the internal - // position in to the vector. - // Returns true iff the vector contains the given value at or after current - // position. - // Not thread-safe. - bool ScanForValue(int value) { - while (current_index_ < values_->size() && - (*values_)[current_index_] <= value) { - if ((*values_)[current_index_] == value) { - current_index_++; - return true; - } - current_index_++; - } - return false; - } - - private: - int current_index_; - const std::vector* values_; -}; -} // namespace - -Status CreateXlaKernel(FunctionLibraryRuntime* flr, const NodeDef& node_def, - std::unique_ptr* kernel) { - if (!CanCreateXlaKernel(node_def)) { - return errors::Internal("Invalid node: ", node_def.ShortDebugString()); - } - - VLOG(3) << "Attempting to create XlaLaunchOp for " << node_def.DebugString(); - - // Make sure that kernels have been registered on the JIT device. 
- XlaOpRegistry::RegisterCompilationKernels(); - RecursiveCompilabilityChecker::UncompilableNodesMap uncompilable_nodes_map; - if (!IsCompilable(flr, node_def, &uncompilable_nodes_map)) { - std::vector - uncompilable_node_info; - for (const auto& it : uncompilable_nodes_map) { - for (const auto& info : it.second.second) { - uncompilable_node_info.emplace_back(info); - } - } - string message = absl::StrCat( - "Function invoked by the following node is not compilable: ", - SummarizeNodeDef(node_def, /*max_inputs_in_summary=*/10), ".\n"); - absl::StrAppend(&message, "Uncompilable nodes:"); - for (const auto& node_info : uncompilable_node_info) { - string node_message = - absl::StrCat("\n", node_info.name, ": ", - node_info.uncompilable_reason, "\n", "\tStacktrace:\n"); - for (const auto& stack_frame : node_info.stack_trace) { - absl::StrAppendFormat(&node_message, "\t\tNode: %s, function: %s\n", - stack_frame.name, stack_frame.function_name); - } - absl::StrAppend(&message, node_message); - } - VLOG(1) << message; - return errors::InvalidArgument(message); - } - - // Get function body, constant args, and resource args. - const FunctionBody* fbody = nullptr; - std::vector constant_arg_indices; - std::vector resource_arg_indices; - TF_RETURN_IF_ERROR(GetBodyAndConstantsAndResources( - flr, node_def, &fbody, &constant_arg_indices, &resource_arg_indices)); - - // Set input and output memory types. - MemoryTypeVector input_memory_types(fbody->arg_types.size(), DEVICE_MEMORY); - // These indices are used only for optimization purposes. They allow us - // to loop over constant_arg_indices and resource_arg_indices only once - // while iterating over all the function arguments checking if it is a - // resource or a constant. - // The reason we optimized this code is because functions can have a lot of - // captured arguments. For example, the backward pass of ResNet50 takes in all - // 214 variables and a similar number of activations. - SinglePassSearch constants_search(&constant_arg_indices); - SinglePassSearch resources_search(&resource_arg_indices); - for (size_t i = 0; i < fbody->arg_types.size(); ++i) { - if (resources_search.ScanForValue(i) || constants_search.ScanForValue(i)) { - // Compile-time constants and resource handles are expected to be in - // host memory. - input_memory_types[i] = HOST_MEMORY; - } - } - // One might wonder, about the case where a compile-time constant argument - // (which must be in host memory) is also used as an input into an op, - // e.g. Add, that expects its inputs in device memory. Here is how it - // works now. - // First, what do we mean by "op expects an input in XYZ memory"? - // There are two types of "ops" here: the tf2xla kernel and the HLO - // computation it builds. The tf2xla kernel needs to retrieve the actual - // numeric value of the compile-time constant tensors, so it really expects - // them to be on in host memory. However, for other inputs, it refers to them - // using xla::ComputationDataHandle, which is just a symbolic handle that - // xla::ComputationBuilder assigns. How does this handle gets assigned for - // constant arguments? Even constant arguments get an _Arg node in the graph - // instantiated for Function compilation. The tf2xla kernel for constant _Arg - // nodes takes the constant value, converts it to XlaLiteral, and feeds it - // to xla::ComputationBuilder.ConstantLiteral, which returns the handle. 
This - // constant XlaLiteral is included in the HLO graph, and subsequently, in - // the actual executable, which is copied to the device before being - // executed. Thus, when this executable runs, the constant is available in - // device memory. - - // XlaLaunch kernel keeps all outputs (including constants, which it copies), - // in device memory except for resources. - MemoryTypeVector output_memory_types(fbody->ret_types.size(), DEVICE_MEMORY); - for (size_t i = 0; i < fbody->ret_types.size(); ++i) { - if (fbody->ret_types[i] == DT_RESOURCE) { - output_memory_types[i] = HOST_MEMORY; - } - } - - // Create the kernel. - NameAttrList function; - TF_RETURN_IF_ERROR(NameAndAttrsFromFunctionCall(node_def, &function)); - Device* dev = flr->device(); - Status s; - auto props = std::make_shared( - &fbody->fdef.signature(), node_def, fbody->arg_types, fbody->ret_types); - OpKernelConstruction construction(DeviceType(dev->device_type()), dev, - dev->GetAllocator(AllocatorAttributes()), - flr, dev->resource_manager(), props, - input_memory_types, output_memory_types, - flr->graph_def_version(), &s); - - *kernel = absl::make_unique( - &construction, constant_arg_indices, resource_arg_indices, function, - /*has_ref_vars=*/false); - return s; -} -} // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc index 7f107aaef11..19e2b5a2bb5 100644 --- a/tensorflow/compiler/jit/xla_launch_util.cc +++ b/tensorflow/compiler/jit/xla_launch_util.cc @@ -91,29 +91,19 @@ VariableInfo::~VariableInfo() { Status GetVariableInfosFromCtxInputs(OpKernelContext* ctx, absl::Span variable_indices, std::vector* result) { - std::vector resource_handles; - absl::c_transform( - variable_indices, std::back_inserter(resource_handles), - [&](int variable_idx) { return &HandleFromInput(ctx, variable_idx); }); - - std::vector> variables; - Status s = LookupResources(ctx, resource_handles, &variables); - if (!s.ok()) { - errors::AppendToMessage(&s, kPossibleNonVariableResourceHintMessage); - return s; - } - result->clear(); result->reserve(variable_indices.size()); - for (int i = 0; i < variable_indices.size(); i++) { - // *Release* the variable because we're going to unref it later in - // ~VariableInfo. - Var* variable = variables[i].release(); - int input_idx = variable_indices[i]; - std::string var_name = HandleFromInput(ctx, input_idx).name(); - result->emplace_back(input_idx, var_name, variable); + for (int var_idx : variable_indices) { + Var* variable = nullptr; + ResourceHandle handle = HandleFromInput(ctx, var_idx); + TF_RETURN_IF_ERROR( + LookupOrCreateResource(ctx, handle, &variable, [&](Var** ptr) { + // This var is uninitialized for now. + *ptr = new Var(DT_INVALID); + return Status::OK(); + })); + result->emplace_back(var_idx, handle.name(), variable); } - return Status::OK(); } @@ -166,7 +156,7 @@ Status SnapshotResourceVariables(OpKernelContext* ctx, absl::Span variable_indices, absl::Span variable_infos, ResourceVarsSnapshot* result) { - for (int i = 0; i < variable_indices.size(); i++) { + for (int i = 0, end = variable_indices.size(); i < end; i++) { Var* var = variable_infos[i].var(); (*result)[variable_indices[i]] = var ? 
absl::make_optional(*var->tensor()) : absl::nullopt; @@ -176,35 +166,73 @@ Status SnapshotResourceVariables(OpKernelContext* ctx, XlaComputationLaunchContext::XlaComputationLaunchContext( xla::LocalClient* client, se::DeviceMemoryAllocator* xla_allocator, - bool allocate_xla_tensors, bool use_multiple_streams) + int device_ordinal, bool allocate_xla_tensors, bool use_multiple_streams) : client_(client), xla_allocator_(xla_allocator), allocate_xla_tensors_(allocate_xla_tensors), - use_multiple_streams_(use_multiple_streams) { + use_multiple_streams_(use_multiple_streams), + device_ordinal_(device_ordinal) { if (use_multiple_streams_) { CHECK(allocate_xla_tensors_) << "To use multiple streams correctly we must " "be allocating XLA tensors!"; } } -void XlaComputationLaunchContext::PopulateInputs( +// Fills in `execution_input` with `buffer` for `index`. +static void PopulateExecutionInputBuffer(xla::ExecutionInput& execution_input, + xla::ShapeIndex index, + se::DeviceMemoryBase& buffer, + bool donate_buffer, int device_ordinal, + se::DeviceMemoryAllocator* allocator) { + xla::MaybeOwningDeviceMemory* in_buffer = + execution_input.MutableBuffer(index); + if (donate_buffer) { + *in_buffer = se::OwningDeviceMemory(buffer, device_ordinal, allocator); + buffer = se::DeviceMemoryBase(); + } else { + *in_buffer = buffer; + } +} + +xla::StatusOr> +XlaComputationLaunchContext::PopulateInputs( OpKernelContext* ctx, const XlaCompiler::CompilationResult* compilation_result, - const ResourceVarsSnapshot& variables, int missing_ctx_input_prefix) { - // Build ShapedBuffers that point directly to the Tensor buffers. - arg_ptrs_ = - std::vector(compilation_result->xla_input_shapes.size()); + const std::map& resource_vars, + int missing_ctx_input_prefix, + const xla::HloInputOutputAliasConfig& input_output_alias) { + std::vector arguments; + arguments.reserve(compilation_result->xla_input_shapes.size()); xla::TransferManager* transfer_manager = client_->backend().transfer_manager(); - for (int i = 0; i < compilation_result->xla_input_shapes.size(); ++i) { + for (int i = 0, end = compilation_result->xla_input_shapes.size(); i < end; + ++i) { int arg_num = compilation_result->input_mapping[i]; CHECK_GE(arg_num, missing_ctx_input_prefix); const xla::Shape& shape = compilation_result->xla_input_shapes[i]; - const Tensor* t = variables.count(arg_num) - ? &(variables.at(arg_num).value()) + const xla::Shape& device_shape = + transfer_manager->HostShapeToDeviceShape(shape); + + bool is_resource_variable = resource_vars.count(arg_num); + bool is_updated_resource_variable = + is_resource_variable && + absl::c_any_of(compilation_result->resource_updates, + [&](const XlaCompiler::ResourceUpdate& update) { + return update.input_index == i && update.modified; + }); + + const Tensor* t = is_resource_variable + ? 
resource_vars.at(arg_num) : &(ctx->input(arg_num - missing_ctx_input_prefix)); CHECK(t); + bool donate_buffer = + t->RefCountIsOne() && is_updated_resource_variable && + input_output_alias.ParameterHasAlias(i, xla::ShapeIndex{}); + VLOG(3) << "Processing input: " << i + << "; is_resource_variable=" << is_resource_variable + << "; is_updated_resource_variable=" << is_updated_resource_variable + << "; donate_buffer=" << donate_buffer; if (use_multiple_streams_) { CHECK(ctx->op_device_context() && ctx->op_device_context()->stream()) @@ -215,23 +243,28 @@ void XlaComputationLaunchContext::PopulateInputs( ctx->op_device_context()->stream()); } - if (xla::Shape::Equal().MinorToMajorOnlyInLayout()( - shape, transfer_manager->HostShapeToDeviceShape(shape))) { + arguments.emplace_back(device_shape, shape); + xla::ExecutionInput& execution_input = arguments.back(); + if (xla::Shape::Equal().MinorToMajorOnlyInLayout()(shape, device_shape)) { se::DeviceMemoryBase dmem = XlaTensor::DeviceMemoryFromTensor(*t); - arg_buffers_.emplace_back( - /*on_host_shape=*/shape, /*on_device_shape=*/shape, - client_->platform(), client_->default_device_ordinal()); - arg_buffers_.back().set_buffer(dmem, /*index=*/{}); - arg_ptrs_[i] = &arg_buffers_.back(); + PopulateExecutionInputBuffer(execution_input, xla::ShapeIndex{}, dmem, + donate_buffer, device_ordinal_, + xla_allocator_); } else { - const XlaTensor* xla_tensor = XlaTensor::FromTensor(t); + XlaTensor* xla_tensor = XlaTensor::FromTensor(t); CHECK(xla_tensor && xla_tensor->has_shaped_buffer()); - arg_ptrs_[i] = const_cast(&xla_tensor->shaped_buffer()); + xla_tensor->shaped_buffer().buffers().ForEachMutableElement( + [&](const xla::ShapeIndex& index, se::DeviceMemoryBase* buffer) { + PopulateExecutionInputBuffer(execution_input, index, *buffer, + donate_buffer, device_ordinal_, + xla_allocator_); + }); } } + return std::move(arguments); } -// Construct the tensor for given type and buffer. +// Construct the tensor for the given type and buffer. static Tensor MakeTensor(DataType dtype, const TensorShape& shape, se::DeviceMemoryBase buffer, Allocator* allocator) { size_t expected_size = shape.num_elements() * DataTypeSize(dtype); @@ -247,28 +280,26 @@ static Tensor GetOrCreateTensorForOutput( int output_num, OpKernelContext* ctx, int missing_ctx_input_prefix, const xla::HloInputOutputAliasConfig& input_output_alias, absl::Span input_mapping, - const ResourceVarsSnapshot& resource_var_snapshots, DataType output_dtype, - const TensorShape& output_shape, se::DeviceMemoryBase output_buffer, - Allocator* output_allocator) { + const std::map& resource_vars_snapshots, + DataType output_dtype, const TensorShape& output_shape, + se::DeviceMemoryBase output_buffer, Allocator* output_allocator) { xla::ShapeIndex output_index = input_output_alias.shape().IsTuple() ? xla::ShapeIndex({output_num}) : xla::ShapeIndex({}); + CHECK(input_output_alias.shape().IsTuple() || output_num == 0); if (absl::optional alias = input_output_alias.GetAliasedParameter(output_index)) { + VLOG(3) << "Found alias: " << alias->ToString(); int tf_param = input_mapping[alias->parameter_number] - missing_ctx_input_prefix; - const Tensor* input_tensor = &ctx->input(tf_param); - - // If input tensor is a resource variable, alias to the snapshot we took at - // entry time. 
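// The donate_buffer predicate above, spelled out: the input buffer is handed
// over (donated) to XLA only when (1) no other TF tensor shares it
// (RefCountIsOne), (2) it backs a resource variable that this computation
// updates, and (3) the compiled HLO module aliases that parameter to an
// output, so the storage can be reused in place. Equivalent sketch:
const bool donate_buffer =
    t->RefCountIsOne() && is_updated_resource_variable &&
    input_output_alias.ParameterHasAlias(i, xla::ShapeIndex{});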
- if (input_tensor->dtype() == DT_RESOURCE) { - const absl::optional& v = - resource_var_snapshots.at(missing_ctx_input_prefix + tf_param); - CHECK(v.has_value()); - return *v; + const Tensor input_tensor = + ctx->input(tf_param).dtype() != DT_RESOURCE + ? ctx->input(tf_param) + : *resource_vars_snapshots.at(missing_ctx_input_prefix + tf_param); + if (output_buffer.opaque() == input_tensor.data()) { + return input_tensor; } - return *input_tensor; } return MakeTensor(output_dtype, output_shape, output_buffer, output_allocator); @@ -291,12 +322,10 @@ static Status SetOutputForConstant( OpKernelContext* ctx, se::Stream* stream, const XlaCompiler::CompilationResult* compilation_result, int output_num) { CHECK(compilation_result->outputs[output_num].is_constant); - // Output is a constant. const Tensor& const_tensor = compilation_result->outputs[output_num].constant_value; Tensor* output_tensor; - const size_t total_bytes = const_tensor.TotalBytes(); - if (stream && total_bytes > 0) { + if (stream && const_tensor.TotalBytes() > 0) { // Copy host -> device. (Empty tensors don't have backing buffers.) // Manually allocate memory using an XlaTensorBuffer so we can allocate // as much memory as the device requires (as given by @@ -335,52 +364,55 @@ static Status SetOutputForConstant( return Status::OK(); } -// Creates a list of updates resource variables. -static xla::StatusOr> GatherVariableInfo( - OpKernelContext* ctx, - const XlaCompiler::CompilationResult* compilation_result, - int missing_ctx_input_prefix) { - std::vector variable_infos; - variable_infos.reserve(compilation_result->resource_updates.size()); +static xla::StatusOr GetOrCreateResourceVar( + OpKernelContext* ctx, const ResourceHandle& handle, + const XlaCompiler::ResourceUpdate& write) { + Var* variable = nullptr; + TF_RETURN_IF_ERROR( + LookupOrCreateResource(ctx, handle, &variable, [&write](Var** ptr) { + *ptr = new Var(write.type); + return Status::OK(); + })); + return variable; +} - for (int i = 0; i < compilation_result->resource_updates.size(); ++i) { +xla::StatusOr> GatherVariableInfo( + OpKernelContext* ctx, + const XlaCompiler::CompilationResult& compilation_result, + int missing_ctx_input_prefix) { + std::vector out; + out.reserve(compilation_result.resource_updates.size()); + for (int i = 0; i < compilation_result.resource_updates.size(); ++i) { const XlaCompiler::ResourceUpdate& write = - compilation_result->resource_updates[i]; + compilation_result.resource_updates[i]; int actual_input_index = write.input_index - missing_ctx_input_prefix; if (actual_input_index < 0 || actual_input_index >= ctx->num_inputs()) { return errors::Internal("Invalid input index for variable write."); } - // TODO(b/35625933): tensorflow::Var should contain a PersistentTensor, - // not a Tensor. 
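// GetOrCreateResourceVar above returns a StatusOr of Var*, which the callers
// below unwrap with TF_ASSIGN_OR_RETURN. The macro expands to roughly this
// hand-written equivalent:
xla::StatusOr<Var*> maybe_var = GetOrCreateResourceVar(ctx, handle, write);
if (!maybe_var.ok()) return maybe_var.status();
Var* variable = maybe_var.ValueOrDie();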
- Var* variable = nullptr; const ResourceHandle handle = HandleFromInput(ctx, actual_input_index); - TF_RETURN_IF_ERROR(LookupOrCreateResource(ctx, handle, &variable, - [&write](Var** ptr) { - *ptr = new Var(write.type); - return Status::OK(); - })); - variable_infos.emplace_back(actual_input_index, handle.name(), variable); + TF_ASSIGN_OR_RETURN(Var * variable, + GetOrCreateResourceVar(ctx, handle, write)); + out.emplace_back(actual_input_index, handle.name(), variable); } - return variable_infos; + return std::move(out); } Status XlaComputationLaunchContext::PopulateOutputs( OpKernelContext* ctx, const XlaCompiler::CompilationResult* compilation_result, ScopedShapedBuffer output, int missing_ctx_input_prefix, + absl::Span variable_infos, const xla::HloInputOutputAliasConfig& input_output_alias, - const ResourceVarsSnapshot& resource_var_snapshots) { + const std::map& resource_vars) { se::Stream* stream = ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr; Allocator* allocator = ctx->device()->GetAllocator({}); // Computation output should always be a tuple. - if (VLOG_IS_ON(2)) { - VLOG(2) << "Result tuple shape: " << output.on_host_shape().DebugString(); - VLOG(2) << "Result tuple shape (on device): " - << output.on_device_shape().DebugString(); - } + VLOG(2) << "Result tuple shape: " << output.on_host_shape().DebugString(); + VLOG(2) << "Result tuple shape (on device): " + << output.on_device_shape().DebugString(); CHECK_EQ(ctx->num_outputs(), compilation_result->outputs.size()); // If the on-host-shape isn't a tuple, create a new single-element tuple @@ -435,11 +467,11 @@ Status XlaComputationLaunchContext::PopulateOutputs( // Copy XLA results to the OpOutputList. int output_num = 0; - for (int i = 0; i < ctx->num_outputs(); ++i) { + for (int i = 0, end = ctx->num_outputs(); i < end; ++i) { const TensorShape& shape = output_tensor_shapes[i]; const DataType& type = compilation_result->outputs[i].type; - VLOG(2) << "Retval " << i << " shape " << shape.DebugString() << " type " - << DataTypeString(type); + VLOG(2) << "Populating output for retval " << i << " shape " + << shape.DebugString() << " type " << DataTypeString(type); if (type == DT_VARIANT) { return errors::Unimplemented( "Support for TensorList crossing the XLA/TF boundary " @@ -467,30 +499,38 @@ Status XlaComputationLaunchContext::PopulateOutputs( se::DeviceMemoryBase buffer = output.buffer({output_num}); Tensor output_tensor = GetOrCreateTensorForOutput( output_num, ctx, missing_ctx_input_prefix, input_output_alias, - compilation_result->input_mapping, resource_var_snapshots, + compilation_result->input_mapping, resource_vars, ctx->expected_output_dtype(i), shape, buffer, allocator); - output.set_buffer(se::OwningDeviceMemory(), {output_num}); ctx->set_output(i, output_tensor); } + output.set_buffer(se::OwningDeviceMemory(), {output_num}); ++output_num; } - - if (VLOG_IS_ON(3)) { - VLOG(3) << ctx->mutable_output(i)->DeviceSafeDebugString(); - } } - // Apply variable updates, if any. - VLOG(2) << "Applying variable updates"; - TF_ASSIGN_OR_RETURN( - std::vector variable_infos, - GatherVariableInfo(ctx, compilation_result, missing_ctx_input_prefix)); - TF_RETURN_IF_ERROR(LockVariables(absl::MakeSpan(variable_infos))); + // input_index -> index into variable_infos. 
+ absl::flat_hash_map variable_info_lookup; + for (int i = 0; i < variable_infos.size(); i++) { + variable_info_lookup.emplace(variable_infos[i].index(), i); + } - for (int i = 0; i < compilation_result->resource_updates.size(); ++i) { + // Apply variable updates, if any. + for (int i = 0, end = compilation_result->resource_updates.size(); i < end; + ++i) { const XlaCompiler::ResourceUpdate& write = compilation_result->resource_updates[i]; - if (variable_infos[i].var()->tensor()->dtype() != write.type) { + int actual_input_index = write.input_index - missing_ctx_input_prefix; + CHECK_GE(actual_input_index, 0); + CHECK_LT(actual_input_index, ctx->num_inputs()); + Var* var = variable_infos[variable_info_lookup[actual_input_index]].var(); + CHECK(var); + + VLOG(2) << "Updating variable #" << i + << " at input index: " << actual_input_index << " with shape " + << write.shape.DebugString() << "; variable tensor has shape: " + << var->tensor()->shape().DebugString(); + + if (var->is_initialized && var->tensor()->dtype() != write.type) { return errors::Internal("Mismatched type in variable write"); } @@ -504,21 +544,21 @@ Status XlaComputationLaunchContext::PopulateOutputs( } } else { se::DeviceMemoryBase buffer = output.buffer({output_num}); - output.set_buffer(se::OwningDeviceMemory(), {output_num}); output_tensor = GetOrCreateTensorForOutput( output_num, ctx, missing_ctx_input_prefix, input_output_alias, - compilation_result->input_mapping, resource_var_snapshots, write.type, + compilation_result->input_mapping, resource_vars, write.type, write.shape, buffer, allocator); } - *variable_infos[i].var()->tensor() = output_tensor; - variable_infos[i].var()->is_initialized |= write.modified; + output.set_buffer(se::OwningDeviceMemory(), {output_num}); + var->is_initialized |= write.modified; + *var->tensor() = output_tensor; ++output_num; } return Status::OK(); } Status XlaComputationLaunchContext::BuildXlaCompilerArguments( - const std::map& constant_args, + const std::map& must_be_constant_args, absl::Span variable_args, OpKernelContext* ctx, std::vector* args) { args->resize(ctx->num_inputs()); @@ -534,9 +574,9 @@ Status XlaComputationLaunchContext::BuildXlaCompilerArguments( for (int64 input_num = 0; input_num < ctx->num_inputs(); ++input_num) { XlaCompiler::Argument& arg = (*args)[input_num]; - if (constant_args.count(input_num) > 0) { + if (must_be_constant_args.count(input_num) > 0) { // Handles compile-time constants. - const Tensor& input = constant_args.at(input_num); + const Tensor& input = must_be_constant_args.at(input_num); TF_RET_CHECK(input.dtype() != DT_RESOURCE); arg.kind = XlaCompiler::Argument::kConstant; arg.type = input.dtype(); @@ -562,7 +602,7 @@ Status XlaComputationLaunchContext::BuildXlaCompilerArguments( arg.name = std::string(variable.name()); arg.kind = XlaCompiler::Argument::kResource; arg.resource_kind = XlaResource::kVariable; - if (variable.var()) { + if (variable.var() && variable.var()->is_initialized) { const Tensor* value = variable.var()->tensor(); arg.type = value->dtype(); arg.shape = value->shape(); diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h index 92b6c4c8a08..b34b3059a4f 100644 --- a/tensorflow/compiler/jit/xla_launch_util.h +++ b/tensorflow/compiler/jit/xla_launch_util.h @@ -81,6 +81,12 @@ class VariableInfo { bool lock_held_ = false; }; +// Creates a list of updated resource variables. 
+xla::StatusOr> GatherVariableInfo( + OpKernelContext* ctx, + const XlaCompiler::CompilationResult& compilation_result, + int missing_ctx_input_prefix); + // Takes a snapshot of the values of resource variable arguments, whose indices // are specified in `variable_indices` argument. We snapshot tensors that back // resource variables since concurrent updates may modify the shape, and it is @@ -124,7 +130,7 @@ class XlaComputationLaunchContext { // objects. XlaComputationLaunchContext(xla::LocalClient* client, se::DeviceMemoryAllocator* xla_allocator, - bool allocate_xla_tensors, + int device_ordinal, bool allocate_xla_tensors, bool use_multiple_streams); // Builds a XlaCompiler::Argument vector from the arguments to an XlaLaunch @@ -142,10 +148,12 @@ class XlaComputationLaunchContext { // missing and adjusts input indices accordingly. All elements in kernel's // input_mapping must be greater than or equal to `missing_ctx_input_prefix` // (in other words, no inputs actually required by the kernel can be missing). - void PopulateInputs(OpKernelContext* ctx, - const XlaCompiler::CompilationResult* compilation_result, - const ResourceVarsSnapshot& variables, - int missing_ctx_input_prefix); + xla::StatusOr> PopulateInputs( + OpKernelContext* ctx, + const XlaCompiler::CompilationResult* compilation_result, + const std::map& resource_vars, + int missing_ctx_input_prefix, + const xla::HloInputOutputAliasConfig& input_output_alias); // Given the XLA output in `output`, populate all outputs of `ctx`. Also // writes out the resource variable updates. @@ -161,20 +169,16 @@ class XlaComputationLaunchContext { OpKernelContext* ctx, const XlaCompiler::CompilationResult* compilation_result, xla::ScopedShapedBuffer output, int missing_ctx_input_prefix, + absl::Span variable_infos, const xla::HloInputOutputAliasConfig& input_output_alias, - const ResourceVarsSnapshot& resource_var_snapshots); - - // Return the argument list. Only valid after PopulateInputs() has been - // called. - const std::vector& arguments() const { return arg_ptrs_; } + const std::map& resource_vars); private: xla::LocalClient* client_; se::DeviceMemoryAllocator* xla_allocator_; bool allocate_xla_tensors_; bool use_multiple_streams_; - std::deque arg_buffers_; - std::vector arg_ptrs_; + int device_ordinal_; }; // A simple TensorBuffer implementation that allows us to create Tensors that diff --git a/tensorflow/compiler/jit/xla_platform_info.cc b/tensorflow/compiler/jit/xla_platform_info.cc new file mode 100644 index 00000000000..e2a89353055 --- /dev/null +++ b/tensorflow/compiler/jit/xla_platform_info.cc @@ -0,0 +1,158 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/jit/xla_platform_info.h" + +#include "tensorflow/compiler/xla/client/client_library.h" + +namespace tensorflow { + +Status BuildXlaCompilationCache(OpKernelContext* ctx, + const XlaPlatformInfo& platform_info, + XlaCompilationCache** cache) { + if (platform_info.xla_device_metadata()) { + *cache = new XlaCompilationCache( + platform_info.xla_device_metadata()->client(), + platform_info.xla_device_metadata()->jit_device_type()); + return Status::OK(); + } + + auto platform = + se::MultiPlatformManager::PlatformWithId(platform_info.platform_id()); + if (!platform.ok()) { + return platform.status(); + } + + xla::StatusOr compiler_for_platform = + xla::Compiler::GetForPlatform(platform.ValueOrDie()); + if (!compiler_for_platform.ok()) { + // In some rare cases (usually in unit tests with very small clusters) we + // may end up transforming an XLA cluster with at least one GPU operation + // (which would normally force the cluster to be compiled using XLA:GPU) + // into an XLA cluster with no GPU operations (i.e. containing only CPU + // operations). Such a cluster can fail compilation (in way that + // MarkForCompilation could not have detected) if the CPU JIT is not linked + // in. + // + // So bail out of _XlaCompile in this case, and let the executor handle the + // situation for us. + const Status& status = compiler_for_platform.status(); + if (status.code() == error::NOT_FOUND) { + return errors::Unimplemented("Could not find compiler for platform ", + platform.ValueOrDie()->Name(), ": ", + status.ToString()); + } + } + + xla::LocalClientOptions client_options; + client_options.set_platform(platform.ValueOrDie()); + client_options.set_intra_op_parallelism_threads( + ctx->device()->tensorflow_cpu_worker_threads()->num_threads); + auto client = xla::ClientLibrary::GetOrCreateLocalClient(client_options); + if (!client.ok()) { + return client.status(); + } + const XlaOpRegistry::DeviceRegistration* registration; + if (!XlaOpRegistry::GetCompilationDevice(platform_info.device_type().type(), + ®istration)) { + return errors::InvalidArgument("No JIT device registered for ", + platform_info.device_type().type()); + } + *cache = new XlaCompilationCache( + client.ValueOrDie(), DeviceType(registration->compilation_device_name)); + return Status::OK(); +} + +XlaPlatformInfo XlaPlatformInfoFromContext(OpKernelConstruction* ctx) { + DeviceType device_type = ctx->device_type(); + se::Platform::Id platform_id = nullptr; + const XlaDevice::Metadata* xla_device_metadata = nullptr; + se::DeviceMemoryAllocator* custom_allocator = nullptr; + + if (ctx->device_type() == DeviceType(DEVICE_CPU)) { + platform_id = se::host::kHostPlatformId; + } else if (ctx->device_type() == DeviceType(DEVICE_GPU)) { + platform_id = ctx->device() + ->tensorflow_gpu_device_info() + ->stream->parent() + ->platform() + ->id(); + } else if (XlaDevice::GetMetadata(ctx, &xla_device_metadata).ok()) { + // If we are on an XlaDevice, use the underlying XLA platform's allocator + // directly. We could use the StreamExecutor's allocator which may + // theoretically be more correct, but XLA returns a nice OOM message in a + // Status and StreamExecutor does not. + // + // Importantly we can't use ctx->device()->GetAllocator() as the allocator + // (which xla_allocator above uses) as on an XlaDevice, this is a dummy + // allocator that returns XlaTensor objects. 
The XlaCompiler needs a real + // allocator to allocate real buffers. + platform_id = xla_device_metadata->platform()->id(); + custom_allocator = + xla_device_metadata->client()->backend().memory_allocator(); + } + + return XlaPlatformInfo(device_type, platform_id, xla_device_metadata, + custom_allocator); +} + +se::DeviceMemoryAllocator* GetAllocator( + absl::optional* tf_allocator_adapter, + OpKernelContext* ctx, const XlaPlatformInfo& platform_info) { + if (platform_info.custom_allocator()) { + return platform_info.custom_allocator(); + } + if (!ctx->op_device_context()) { + // Stream is not set for the host platform. + se::Platform* platform = + se::MultiPlatformManager::PlatformWithId(platform_info.platform_id()) + .ValueOrDie(); + tf_allocator_adapter->emplace(ctx->device()->GetAllocator({}), platform); + return &tf_allocator_adapter->value(); + } + tf_allocator_adapter->emplace(ctx->device()->GetAllocator({}), + ctx->op_device_context()->stream()); + return &tf_allocator_adapter->value(); +} + +XlaCompiler::Options GenerateCompilerOptions( + XlaCompilationCache* cache, OpKernelContext* ctx, + const XlaPlatformInfo& platform_info, bool has_ref_vars, + absl::optional* tf_allocator_adapter) { + XlaCompiler::Options options; + options.client = static_cast(cache->client()); + if (ctx->op_device_context() != nullptr) { + options.device_ordinal = + ctx->op_device_context()->stream()->parent()->device_ordinal(); + } + options.device_type = cache->device_type(); + options.flib_def = ctx->function_library()->GetFunctionLibraryDefinition(); + options.graph_def_version = ctx->function_library()->graph_def_version(); + options.allow_cpu_custom_calls = + (platform_info.platform_id() == se::host::kHostPlatformId); + options.device_allocator = + GetAllocator(tf_allocator_adapter, ctx, platform_info); + if (platform_info.xla_device_metadata()) { + options.shape_representation_fn = + platform_info.xla_device_metadata()->shape_representation_fn(); + } + // If reference variables are not present in the graph, we can safely alias + // passthrough parameters without performing a copy. + options.alias_passthrough_params = + !has_ref_vars && !platform_info.is_on_xla_device(); + return options; +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_platform_info.h b/tensorflow/compiler/jit/xla_platform_info.h new file mode 100644 index 00000000000..dac45529ac9 --- /dev/null +++ b/tensorflow/compiler/jit/xla_platform_info.h @@ -0,0 +1,108 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_JIT_XLA_PLATFORM_INFO_H_
+#define TENSORFLOW_COMPILER_JIT_XLA_PLATFORM_INFO_H_
+
+#include "tensorflow/compiler/jit/xla_compilation_cache.h"
+#include "tensorflow/compiler/jit/xla_device.h"
+#include "tensorflow/stream_executor/tf_allocator_adapter.h"
+
+namespace tensorflow {
+
+// Holds some information about the platform on which an
+// XlaLaunch/_XlaCompile/_XlaRun op must run. Provides a common layer of
+// abstraction for normal and XLA devices.
+class XlaPlatformInfo {
+ public:
+  XlaPlatformInfo() : device_type_("") {}
+  XlaPlatformInfo(XlaPlatformInfo&&) = default;
+  explicit XlaPlatformInfo(const DeviceType device_type,
+                           se::Platform::Id platform_id,
+                           const XlaDevice::Metadata* xla_device_metadata,
+                           se::DeviceMemoryAllocator* device_allocator)
+      : device_type_(device_type),
+        platform_id_(platform_id),
+        xla_device_metadata_(xla_device_metadata),
+        device_allocator_(device_allocator) {}
+
+  XlaPlatformInfo& operator=(XlaPlatformInfo&& other) = default;
+
+  bool UseMultipleStreams() const {
+    return xla_device_metadata_ && xla_device_metadata_->UseMultipleStreams();
+  }
+
+  // Non-null only when run on an XLA device.
+  se::DeviceMemoryAllocator* custom_allocator() const {
+    return device_allocator_;
+  }
+
+  DeviceType device_type() const { return device_type_; }
+
+  // This is equal to xla_device_metadata()->platform()->id() if
+  // xla_device_metadata() is not nullptr.
+  se::Platform::Id platform_id() const { return platform_id_; }
+
+  // This may be null if the op this XlaPlatformInfo is for was not placed on an
+  // XLA device.
+  const XlaDevice::Metadata* xla_device_metadata() const {
+    return xla_device_metadata_;
+  }
+  bool is_on_xla_device() const { return xla_device_metadata() != nullptr; }
+
+ private:
+  DeviceType device_type_;
+  se::Platform::Id platform_id_;
+
+  // xla_device_metadata_ lives in the tensorflow::DeviceBase in which the
+  // XlaLaunch/_XlaCompile/_XlaRun op is placed and thus does not die before the
+  // XlaLaunch/_XlaCompile/_XlaRun OpKernel.
+  const XlaDevice::Metadata* xla_device_metadata_;
+
+  // If the op associated with this XlaPlatformInfo is placed on an XLA device,
+  // then device_allocator_ is the xla::Backend's memory allocator. If the op
+  // is placed on a regular CPU or GPU device, then device_allocator_ is null.
+  se::DeviceMemoryAllocator* device_allocator_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(XlaPlatformInfo);
+};
+
+// Creates an XLA compilation cache and returns it in `cache`.
+Status BuildXlaCompilationCache(OpKernelContext* ctx,
+                                const XlaPlatformInfo& platform_info,
+                                XlaCompilationCache** cache);
+
+// Returns information about the platform from the kernel context.
+XlaPlatformInfo XlaPlatformInfoFromContext(OpKernelConstruction* ctx);
+
+// Returns the custom allocator from the platform info if non-null, or
+// populates and returns a pointer to the allocator adapter built from the
+// context's allocator.
+//
+// This is necessary because for XLA devices the underlying TF allocator returns
+// dummy tensors.
+se::DeviceMemoryAllocator* GetAllocator(
+    absl::optional* tf_allocator_adapter,
+    OpKernelContext* ctx, const XlaPlatformInfo& platform_info);
+
+// Returns options for the XLA compiler, and writes the allocator that will be
+// used into `tf_allocator_adapter`.
+XlaCompiler::Options GenerateCompilerOptions( + XlaCompilationCache* cache, OpKernelContext* ctx, + const XlaPlatformInfo& platform_info, bool has_ref_vars, + absl::optional* tf_allocator_adapter); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_XLA_PLATFORM_INFO_H_ diff --git a/tensorflow/compiler/mlir/BUILD b/tensorflow/compiler/mlir/BUILD index 57f923caa91..01c187790b7 100644 --- a/tensorflow/compiler/mlir/BUILD +++ b/tensorflow/compiler/mlir/BUILD @@ -150,6 +150,7 @@ tf_cc_binary( "//tensorflow/compiler/mlir/tensorflow:translate_registration", "//tensorflow/compiler/mlir/tensorflow:translate_tf_dialect_op", "//tensorflow/compiler/mlir/xla:xla_mlir_translate", + "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core:tensorflow", diff --git a/tensorflow/compiler/mlir/g3doc/xla_gpu_codegen.md b/tensorflow/compiler/mlir/g3doc/xla_gpu_codegen.md index 2fe109c1783..8e7e605fc4c 100644 --- a/tensorflow/compiler/mlir/g3doc/xla_gpu_codegen.md +++ b/tensorflow/compiler/mlir/g3doc/xla_gpu_codegen.md @@ -74,7 +74,6 @@ We have several choices on how to lower the host-side part from LHLO: * (Pro) easy to implement library calls (cuDNN, cuBLAS, cuFFT, etc), as TFRT ops are interpreted by C++ code. * (Con) host side is under development and not tested. - * (Con) the JAX integration isn’t clear from a runtime point of view * Jitted CPU code * (Pro) great lower-ability. Create a few loops and conditions and it's done. @@ -84,8 +83,7 @@ We have several choices on how to lower the host-side part from LHLO: dynamic loading, etc). * Existing (interpreting) XLA runtime -Tentative conclusion: Use jitted CPU code during the transition, and optionally -adopt TFRT in the end. +Decision: adopt TFRT, but also support jitting CPU code in TFRT. ## Migrating Device LLVM IR (Task 3) @@ -114,7 +112,7 @@ end state of each XLA op: * (Cost) Will be throw-away work if we want to ultimately migrate to Standard. * (Benefit) It is easy and mechanical. Can be done in a short period. - * (Benefit) It doesn't benefit more compared to a). + * (Benefit) It doesn't benefit more compared to (1). 1. Refactor old emitters to be like LHLO -> MLIR GPU + Standard + Loops: * (Cost) Lifting existing emitters to Standard introduces some challenges. Pointers and GEPs need to be converted to MemRefs and SubViews. Ensuring @@ -134,6 +132,19 @@ end state of each XLA op: * (Benefit) unified stack; community support; portability; more optimization potentials. +Conclusions: + +* Don't go for (2). (1) or (3) are just better than (2). (2) costs more than + (1), since it requires a lot of mechanical refactoring. With (1) we can + still achieve the goal of enabling XLA to pick up MLIR emitters. This is by + doing LHLO -> LLVM IR -> run legacy device emitters. +* ElementalIrEmitter ops go for (4), but not incrementally. There is no way to + do it op by op, because all elementally-emitted ops are connected into the + same graph. This work can also serve as a unification point of several + on-going forces (xla/service/mlir\_gpu, the kernel generator, Linalg). +* All other ops go for (1). As a stretch goal, they might be migrated to (3) + or (4). + ## Prioritization While all three tasks mentioned above are parallelizable, under limited @@ -210,26 +221,19 @@ The exact profiling can't be easily done for MLIR-generated ops, since: ### Step 3: (Task 2) Migrating Thunks -This step migrates all host ops and library calls. 
This step will eliminate most -of the thunks and produce serializable MLIR instead. - -There are roughly three kinds of thunks: - +As a note, there are roughly three kinds of thunks: * KernelThunk, which launches a kernel. * Control flow thunks, which has host control flow logic (conditional, while, for, sequence) and launch body kernels. * Library thunks: cuDNN, cuBLAS, cuFFT, NCCL, etc. -The **bottom line** is to: +The plan is: +* Make Thunks (de)serializable. +* Help improve TFRT to a state where it can support these semantics. +* As the state improves, migrate individual thunks incrementally. -* Create a Thunk dialect that provides (de)serialize logic for all existing - C++-based Thunks. -* Change emitters to emit a graph of Thunk dialect. - -**Optionally**, we can relieve some thunks from C++ implementation. KernelThunk -can lower to the GPU LaunchKernelOp. Control flow thunks can leverage the CFG -Dialect for loops and conditions, combined with LaunchKernelOp. This optional -step requires profiling and stream support. +These action items are only partially ordered. The actual execution order / +engineering parallelism is to be evaluated as it goes. ### Step 4: (Task 3) Migrated ElementalIrEmitter diff --git a/tensorflow/compiler/mlir/glob_lit_test.bzl b/tensorflow/compiler/mlir/glob_lit_test.bzl index 9f6856f3636..edbf3663a89 100644 --- a/tensorflow/compiler/mlir/glob_lit_test.bzl +++ b/tensorflow/compiler/mlir/glob_lit_test.bzl @@ -52,7 +52,7 @@ def _run_lit_test(name, data, size, tags, driver, features, exec_properties): native.py_test( name = name, srcs = ["@llvm-project//llvm:lit"], - tags = tags + ["no_windows"], + tags = tags + ["no_pip", "no_windows"], args = [ "tensorflow/compiler/mlir/" + paths.basename(data[-1]) + " --config-prefix=runlit -v", ] + features, diff --git a/tensorflow/compiler/mlir/hlo/.gitignore b/tensorflow/compiler/mlir/hlo/.gitignore new file mode 100644 index 00000000000..cc1696bf575 --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/.gitignore @@ -0,0 +1,4 @@ +build +llvm-project +llvm-build + diff --git a/tensorflow/compiler/mlir/hlo/BUILD b/tensorflow/compiler/mlir/hlo/BUILD index c7bda887db0..126d44670a0 100644 --- a/tensorflow/compiler/mlir/hlo/BUILD +++ b/tensorflow/compiler/mlir/hlo/BUILD @@ -55,6 +55,38 @@ filegroup( ], ) +gentbl( + name = "MhloPassIncGen", + strip_include_prefix = "include/mlir-hlo/Dialect/mhlo/transforms/", + tbl_outs = [ + ( + "-gen-pass-decls -name MHLO", + "include/mlir-hlo/Dialect/mhlo/transforms/mhlo_passes.h.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "include/mlir-hlo/Dialect/mhlo/transforms/mhlo_passes.td", + td_srcs = [ + "@llvm-project//mlir:PassBaseTdFiles", + ], +) + +gentbl( + name = "LmhloPassIncGen", + strip_include_prefix = "include/mlir-hlo/Dialect/mhlo/transforms/", + tbl_outs = [ + ( + "-gen-pass-decls -name LMHLO", + "include/mlir-hlo/Dialect/mhlo/transforms/lmhlo_passes.h.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "include/mlir-hlo/Dialect/mhlo/transforms/lmhlo_passes.td", + td_srcs = [ + "@llvm-project//mlir:PassBaseTdFiles", + ], +) + gentbl( name = "chlo_ops_inc_gen", strip_include_prefix = "include", @@ -76,8 +108,8 @@ gentbl( tbl_outs = [ ("-gen-op-decls", "include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h.inc"), ("-gen-op-defs", "include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.cc.inc"), - ("-gen-struct-attr-decls", "include/mlir-hlo/Dialect/mhlo/IR/hlo_structs.h.inc"), - ("-gen-struct-attr-defs", "include/mlir-hlo/Dialect/mhlo/IR/hlo_structs.cc.inc"), + 
("-gen-struct-attr-decls", "include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_structs.h.inc"), + ("-gen-struct-attr-defs", "include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_structs.cc.inc"), ], tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.td", @@ -106,14 +138,36 @@ gentbl( td_srcs = [":hlo_ops_td_files"], ) +gentbl( + name = "hlo_ops_pattern_gen", + strip_include_prefix = "lib/Dialect/mhlo/IR/", + tbl_outs = [ + ( + "-gen-rewriters", + "lib/Dialect/mhlo/IR/hlo_patterns.cc.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "lib/Dialect/mhlo/IR/hlo_patterns.td", + td_relative_includes = [ + "include", + ], + td_srcs = [ + ":hlo_ops_td_files", + "@llvm-project//mlir:StdOpsTdFiles", + "@llvm-project//mlir:include/mlir/Dialect/Shape/IR/ShapeBase.td", + "@llvm-project//mlir:include/mlir/Dialect/Shape/IR/ShapeOps.td", + ], +) + gentbl( name = "lhlo_ops_inc_gen", strip_include_prefix = "include", tbl_outs = [ ("-gen-op-decls", "include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h.inc"), ("-gen-op-defs", "include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.cc.inc"), - ("-gen-struct-attr-decls", "include/mlir-hlo/Dialect/mhlo/IR/lhlo_structs.h.inc"), - ("-gen-struct-attr-defs", "include/mlir-hlo/Dialect/mhlo/IR/lhlo_structs.cc.inc"), + ("-gen-struct-attr-decls", "include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops_structs.h.inc"), + ("-gen-struct-attr-defs", "include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops_structs.cc.inc"), ], tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.td", @@ -126,11 +180,12 @@ gentbl( #TODO(aminim): revisit the naming and grouping of these rules post-move. gentbl( name = "canonicalize_inc_gen", + strip_include_prefix = "lib/Dialect/mhlo/IR/", tbl_outs = [ - ("-gen-rewriters", "lib/Dialect/mhlo/transforms/generated_canonicalize.inc"), + ("-gen-rewriters", "lib/Dialect/mhlo/IR/mhlo_canonicalize.inc"), ], tblgen = "@llvm-project//mlir:mlir-tblgen", - td_file = "lib/Dialect/mhlo/transforms/canonicalize.td", + td_file = "lib/Dialect/mhlo/IR/mhlo_canonicalize.td", td_relative_includes = [ "include", ], @@ -146,7 +201,7 @@ gentbl( ), ( "-gen-op-interface-defs", - "include/mlir-hlo/Dialect/mhlo/IR/infer_fusibility_op_interface.cc.inc", + "include/mlir-hlo/Dialect/mhlo/IR/infer_fusibility_op_interface.cpp.inc", ), ], tblgen = "@llvm-project//mlir:mlir-tblgen", @@ -168,6 +223,7 @@ cc_library( "include/mlir-hlo/Dialect/mhlo/IR/infer_fusibility_op_interface.h", "include/mlir-hlo/Dialect/mhlo/IR/infer_fusibility_op_interface.h.inc", ], + includes = ["include"], deps = [ ":infer_fusibility_op_interface_gen", "@llvm-project//mlir:IR", @@ -180,6 +236,7 @@ cc_library( name = "convert_op_folder", srcs = ["lib/utils/convert_op_folder.cc"], hdrs = ["include/mlir-hlo/utils/convert_op_folder.h"], + includes = ["include"], deps = [ "@llvm-project//mlir:IR", ], @@ -203,13 +260,13 @@ cc_library( ], includes = ["include"], deps = [ + "hlo_ops_pattern_gen", ":canonicalize_inc_gen", ":chlo_ops_inc_gen", ":convert_op_folder", ":hlo_ops_base_inc_gen", ":hlo_ops_inc_gen", ":infer_fusibility_op_interface", - "@com_google_absl//absl/container:flat_hash_set", "@llvm-project//llvm:Support", "@llvm-project//mlir:Analysis", "@llvm-project//mlir:IR", @@ -254,7 +311,7 @@ cc_library( ) cc_library( - name = "hlo_dialect_registration", + name = "hlo_dialect_force_registration", srcs = ["lib/Dialect/mhlo/IR/dialect_registration.cc"], deps = [ ":hlo", @@ -264,6 +321,17 @@ cc_library( alwayslink = 1, ) +cc_library( + name = 
"hlo_dialect_registration", + srcs = ["lib/Dialect/mhlo/IR/init.cc"], + hdrs = ["include/mlir-hlo/Dialect/mhlo/IR/register.h"], + deps = [ + ":hlo", + ":lhlo", + "@llvm-project//mlir:IR", + ], +) + cc_library( name = "sink_constants_to_control_flow", srcs = ["lib/Dialect/mhlo/transforms/sink_constants_to_control_flow.cc"], @@ -273,6 +341,7 @@ cc_library( "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Support", "@llvm-project//mlir:Transforms", ], @@ -307,7 +376,6 @@ cc_library( ":hlo", ":lhlo", ":map_lmhlo_to_scalar_op", - "@com_google_absl//absl/memory", "@llvm-project//llvm:Support", "@llvm-project//mlir:Affine", "@llvm-project//mlir:IR", @@ -322,7 +390,6 @@ cc_library( srcs = ["lib/Dialect/mhlo/transforms/lhlo_legalize_to_parallel_loops.cc"], deps = [ ":lhlo", - "@com_google_absl//absl/memory", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:LinalgOps", @@ -337,6 +404,7 @@ cc_library( cc_library( name = "lhlo_legalize_to_llvm", srcs = ["lib/Dialect/mhlo/transforms/lhlo_legalize_to_llvm.cc"], + hdrs = ["include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h"], deps = [ ":lhlo", "@llvm-project//mlir:IR", @@ -357,7 +425,6 @@ cc_library( ":hlo", ":lhlo", ":map_lmhlo_to_scalar_op", - "@com_google_absl//absl/memory", "@llvm-project//llvm:Support", "@llvm-project//mlir:Affine", "@llvm-project//mlir:IR", @@ -375,7 +442,6 @@ cc_library( hdrs = ["include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h"], deps = [ ":hlo", - "@com_google_absl//absl/memory", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@llvm-project//mlir:Shape", @@ -392,7 +458,6 @@ cc_library( ":hlo", ":lhlo", ":map_lmhlo_to_scalar_op", - "@com_google_absl//absl/memory", "@llvm-project//llvm:Support", "@llvm-project//mlir:GPUDialect", "@llvm-project//mlir:IR", @@ -411,7 +476,6 @@ cc_library( hdrs = ["include/mlir-hlo/Dialect/mhlo/transforms/passes.h"], deps = [ ":lhlo", - "@com_google_absl//absl/memory", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:LinalgTransforms", @@ -429,7 +493,6 @@ cc_library( hdrs = ["include/mlir-hlo/Dialect/mhlo/transforms/passes.h"], deps = [ ":lhlo", - "@com_google_absl//absl/memory", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", @@ -450,7 +513,6 @@ cc_library( ":hlo", ":lhlo", ":map_hlo_to_lhlo_op", - "@com_google_absl//absl/memory", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", @@ -465,6 +527,7 @@ cc_library( name = "cycle_detector", srcs = ["lib/utils/cycle_detector.cc"], hdrs = ["include/mlir-hlo/utils/cycle_detector.h"], + includes = ["include"], deps = [ "@llvm-project//llvm:Support", ], @@ -501,13 +564,14 @@ cc_library( gentbl( name = "legalize_to_standard_inc_gen", + strip_include_prefix = "lib/Dialect/mhlo/transforms/", tbl_outs = [ ("-gen-rewriters", "lib/Dialect/mhlo/transforms/generated_legalize_to_standard.inc"), ], tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "lib/Dialect/mhlo/transforms/legalize_to_standard_patterns.td", td_relative_includes = [ - "../hlo/include", + "include", ], td_srcs = [ ":hlo_ops_td_files", @@ -548,6 +612,25 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "legalize_gather_to_torch_index_select", + srcs = ["lib/Dialect/mhlo/transforms/legalize_gather_to_torch_index_select.cc"], + hdrs = [ + "include/mlir-hlo/Dialect/mhlo/transforms/passes.h", + "include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h", + ], + deps = [ 
+ ":hlo", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:Transforms", + ], + alwayslink = 1, +) + cc_library( name = "legalize_tanh_to_approximation", srcs = ["lib/Dialect/mhlo/transforms/legalize_tanh_to_approximation.cc"], @@ -555,6 +638,7 @@ cc_library( "include/mlir-hlo/Dialect/mhlo/transforms/passes.h", "include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h", ], + includes = ["include"], deps = [ "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", @@ -568,13 +652,14 @@ cc_library( gentbl( name = "lower_complex_inc_gen", + strip_include_prefix = "lib/Dialect/mhlo/transforms/", tbl_outs = [ ("-gen-rewriters", "lib/Dialect/mhlo/transforms/generated_lower_complex.inc"), ], tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "lib/Dialect/mhlo/transforms/lower_complex_patterns.td", td_relative_includes = [ - "../hlo/include", + "include", ], td_srcs = [ ":hlo_ops_td_files", @@ -587,9 +672,9 @@ cc_library( #TODO(aminim): find a better name here? name = "mhlo_to_mhlo_lowering_patterns", srcs = [ - "lib/Dialect/mhlo/transforms/generated_lower_complex.inc", "lib/Dialect/mhlo/transforms/lower_complex.cc", "lib/Dialect/mhlo/transforms/lower_general_dot.cc", + "lib/Dialect/mhlo/transforms/optimize_mhlo.cc", ], hdrs = [ "include/mlir-hlo/Dialect/mhlo/transforms/passes.h", @@ -597,7 +682,8 @@ cc_library( ], deps = [ ":hlo", - ":hlo_dialect_registration", + ":hlo_dialect_force_registration", + ":lower_complex_inc_gen", "@llvm-project//llvm:Support", "@llvm-project//mlir:Analysis", "@llvm-project//mlir:IR", @@ -649,7 +735,9 @@ cc_library( deps = [ ":hlo", "@llvm-project//mlir:IR", + "@llvm-project//mlir:SCFDialect", "@llvm-project//mlir:Shape", + "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Transforms", ], ) @@ -661,6 +749,7 @@ cc_library( "lib/Dialect/mhlo/transforms/chlo_legalize_to_hlo_pass.cc", "lib/Dialect/mhlo/transforms/lhlo_legalize_to_llvm_pass.cc", "lib/Dialect/mhlo/transforms/materialize_broadcasts_pass.cc", + "lib/Dialect/mhlo/transforms/optimize_mhlo_pass.cc", "lib/Dialect/mhlo/transforms/test_infer_shaped_type_pass.cc", "lib/Dialect/mhlo/transforms/unfuse_batch_norm_pass.cc", ], @@ -671,13 +760,12 @@ cc_library( ":lhlo_legalize_to_llvm", # build-cleaner: keep ":materialize_broadcasts", # build-cleaner: keep ":unfuse_batch_norm", # build-cleaner: keep - "@llvm-project//mlir:AffineToStandardTransforms", - "@llvm-project//mlir:CFGTransforms", "@llvm-project//mlir:IR", "@llvm-project//mlir:InferTypeOpInterface", "@llvm-project//mlir:LLVMDialect", "@llvm-project//mlir:LLVMTransforms", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SCFDialect", "@llvm-project//mlir:Shape", "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Transforms", @@ -686,15 +774,20 @@ cc_library( ) cc_library( - name = "all_passes_for_testing", + name = "all_passes", + hdrs = [ + "include/mlir-hlo/Dialect/mhlo/transforms/register_passes.h", + ], visibility = [ - "//tensorflow/compiler/mlir:__subpackages__", + ":friends", ], deps = [ + ":LmhloPassIncGen", + ":MhloPassIncGen", ":chlo_legalize_to_hlo", - ":hlo_dialect_registration", ":hlo_legalize_to_lhlo", ":legalize_control_flow", + ":legalize_gather_to_torch_index_select", ":legalize_tanh_to_approximation", ":legalize_to_linalg", ":legalize_to_standard", @@ -709,15 +802,23 @@ cc_library( ":sink_constants_to_control_flow", ":test_passes", ":transform_unranked_hlo", + "@llvm-project//mlir:Pass", ], ) cc_binary( 
name = "mlir-hlo-opt", + srcs = [ + "tools/mlir-hlo-opt/mlir-hlo-opt.cpp", + ], deps = [ - ":all_passes_for_testing", - "@llvm-project//mlir:AllPassesAndDialects", + ":all_passes", + ":hlo_dialect_registration", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", + "@llvm-project//mlir:IR", "@llvm-project//mlir:MlirOptLib", - "@llvm-project//mlir:MlirOptMain", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", ], ) diff --git a/tensorflow/compiler/mlir/hlo/CMakeLists.txt b/tensorflow/compiler/mlir/hlo/CMakeLists.txt new file mode 100644 index 00000000000..c4e2ea123df --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/CMakeLists.txt @@ -0,0 +1,94 @@ +# +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +cmake_minimum_required(VERSION 3.13.4) + +if(POLICY CMP0068) + cmake_policy(SET CMP0068 NEW) + set(CMAKE_BUILD_WITH_INSTALL_NAME_DIR ON) +endif() + +if(POLICY CMP0075) + cmake_policy(SET CMP0075 NEW) +endif() + +if(POLICY CMP0077) + cmake_policy(SET CMP0077 NEW) +endif() + +#------------------------------------------------------------------------------- +# Project setup and globals +#------------------------------------------------------------------------------- + +project(mlir-hlo LANGUAGES CXX C) +set(CMAKE_C_STANDARD 11) +set(CMAKE_CXX_STANDARD 14) +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules") + +#------------------------------------------------------------------------------- +# Options and settings +#------------------------------------------------------------------------------- + +#------------------------------------------------------------------------------- +# MSVC defaults +#------------------------------------------------------------------------------- + +if(MSVC) + add_compile_options( + $<$:/MD> + $<$:/MD> + $<$:/MD> + ) +endif() + +#------------------------------------------------------------------------------- +# MLIR/LLVM Configuration +#------------------------------------------------------------------------------- + +find_package(MLIR REQUIRED CONFIG) +message(STATUS "Using MLIRConfig.cmake in: ${MLIR_DIR}") +message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}") +list(APPEND CMAKE_MODULE_PATH "${MLIR_CMAKE_DIR}") +list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_DIR}") + +if(LLVM_ENABLE_ZLIB) + find_package(ZLIB) +endif() + +include(TableGen) +include(AddLLVM) +include(AddMLIR) +include(HandleLLVMOptions) +include_directories(${LLVM_INCLUDE_DIRS}) +include_directories(${MLIR_INCLUDE_DIRS}) +include_directories(${PROJECT_SOURCE_DIR}/include) +include_directories(${PROJECT_BINARY_DIR}/include) +include_directories(${PROJECT_BINARY_DIR}/) +link_directories(${LLVM_BUILD_LIBRARY_DIR}) +add_definitions(${LLVM_DEFINITIONS}) + +#------------------------------------------------------------------------------- +# Directory setup +#------------------------------------------------------------------------------- + +set(MLIR_HLO_SOURCE_DIR 
${CMAKE_CURRENT_SOURCE_DIR})
+set(MLIR_HLO_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
+
+add_custom_target(check-mlir-hlo)
+
+add_subdirectory(include/mlir-hlo)
+add_subdirectory(lib)
+add_subdirectory(tools)
+add_subdirectory(tests)
diff --git a/tensorflow/compiler/mlir/hlo/README.md b/tensorflow/compiler/mlir/hlo/README.md
new file mode 100644
index 00000000000..9eaa14031fd
--- /dev/null
+++ b/tensorflow/compiler/mlir/hlo/README.md
@@ -0,0 +1,233 @@
+# MLIR-HLO: A Standalone "HLO" MLIR-based Compiler
+
+The code here exists in two places:
+
+* https://github.com/tensorflow/tensorflow/tree/master/tensorflow/compiler/mlir/hlo;
+  this is the canonical location and where contributions should be made using
+  GitHub pull-requests.
+* https://github.com/tensorflow/mlir-hlo; this is a standalone repository with
+  a view of the same code to allow other projects to use this without
+  depending on the entire TF monorepo.
+
+This implements a self-contained compiler for a set of linear algebra
+operations inspired by the XLA
+[HLO IR](https://www.tensorflow.org/xla/architecture#how_does_xla_work), using
+MLIR components. It is designed to provide an end-to-end flow independent of
+TensorFlow and XLA, but usable inside of these projects.
+
+Coding practice and conventions in this repository follow the
+[MLIR Developer Guide](https://mlir.llvm.org/getting_started/DeveloperGuide/),
+as part of the intent for this repository to act as an incubator for technology
+to upstream.
+
+## QuickStart: building and testing
+
+These instructions work on Linux; you may have to adjust them for your platform.
+
+To build the code in this repository, you need a clone of the LLVM/MLIR git
+repository:
+
+    $ git clone https://github.com/llvm/llvm-project.git
+
+
+You need to make sure you have the right commit checked out in the LLVM
+repository (you need to do this every time you pull from this repo):
+
+    $ (cd llvm-project && git checkout $(cat build_tools/llvm_version.txt))
+
+We provide a script to configure and build LLVM/MLIR:
+
+    $ build_tools/build_mlir.sh ${PWD}/llvm-project/ ${PWD}/llvm-build
+
+Again, this is something to do every time you pull from this repository and the
+LLVM revision changes.
+
+Finally, you can build and test this repository:
+
+    $ mkdir build && cd build
+    $ cmake .. -GNinja \
+        -DLLVM_ENABLE_LLD=ON \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DLLVM_ENABLE_ASSERTIONS=On \
+        -DMLIR_DIR=${PWD}/../llvm-build/lib/cmake/mlir
+    $ ninja check-mlir-hlo
+
+
+## Overview
+
+MLIR-HLO aims to provide an end-to-end compiler for CPU and GPU, as well as
+building reusable blocks for other accelerators. This is heavily inspired by the
+success of XLA.
+
+[XLA](https://www.tensorflow.org/xla/) (Accelerated Linear Algebra) is a
+domain-specific compiler framework and execution environment for linear algebra,
+which powers code generation for ML frameworks like TensorFlow, JAX, and others.
+
+A cornerstone of XLA is the HLO (High Level Optimizer) IR, which offers a
+carefully selected, fixed list of operations that are mostly orthogonal to each
+other. It provides an efficient optimizer for computations expressed with this
+set of operations and generates code for hardware platforms such as CPUs, GPUs,
+and TPUs. Its goal is to provide a uniform interface to compile and execute these
+optimized HLO programs independently of the targeted device. It is not a
+front-end ML system like TensorFlow or JAX; rather, it is a backend framework
+that optimizes HLO and lowers it to machine code.
+
+The HLO set of operations is closed and has well-defined semantics.
HLO
+operations operate on immutable Tensors with static shapes (actually bounded
+shapes, to be exact) and explicit broadcasts.
+
+[MLIR](https://mlir.llvm.org/) is a compiler infrastructure which intends to
+come with "batteries included"; as such, it intends to provide all the blocks
+required to assemble graph optimization and codegen pipelines. The longer-term
+roadmap for MLIR is to provide a
+[Tensor Compute Primitive](https://llvm.discourse.group/c/mlir/MLIR-TCP-WG/36)
+(TCP) dialect, which should hopefully be general enough to model what HLO
+represents today (see
+[slides](https://drive.google.com/open?id=1iljcpTQ5NPaMfGpoPDFml1XkYxjK_6A4) and
+[recording](https://drive.google.com/open?id=1jSPa8TwPKUt0WuLquGc8OgSUVYJHMvWZ)
+for a technical discussion on this topic).
+
+The work on MLIR-HLO can be seen as a stepping stone towards building TCP, while
+integrating intermediate components into XLA itself by relying on the
+well-proven HLO IR and introducing more pieces from upstream MLIR
+([Linalg](https://mlir.llvm.org/docs/Dialects/Linalg/),
+[Vector](https://mlir.llvm.org/docs/Dialects/Vector/),
+[GPU](https://mlir.llvm.org/docs/Dialects/GPU/) dialect, ...).
+[This document](https://www.tensorflow.org/mlir/xla_gpu_codegen) provides more
+information on the current migration of the XLA GPU codegen.
+
+## MLIR Dialects for XLA-style compilation
+
+This repository defines three dialects to support an HLO-like compilation
+pipeline using MLIR:
+
+* `chlo`: the "client" HLO dialect, intended to be closer to the frontend
+  (including implicit broadcast semantics).
+* `mhlo`: the "meta"-HLO dialect; similar to `xla_hlo`, but with extensions for
+  dynamic shape support.
+* `lmhlo`: "late"-"meta"-HLO; it is the IR after buffer allocation is
+  performed. In XLA, buffer allocation is a side data structure that keeps
+  track of this information, while this separate dialect materializes it in
+  the IR.
+
+We describe these in more detail below.
+
+### HLO Client Dialect: `chlo`.
+
+* It was originally designed to map the
+  [XLA client APIs](https://www.tensorflow.org/xla/operation_semantics) (e.g.,
+  ops support implicit broadcast and are roughly modeled on the XlaBuilder API)
+  modulo support for dynamic shapes and additional ops required to support
+  dynamic client-side HLOs.
+* Ops can come from either the XlaBuilder, or XLA helper functions can be
+  converted into ops (given the ambiguity in what constitutes these ops, there
+  is some freedom to decide). The goal of this dialect is to correspond closely
+  to the client level and to provide a thin layer between client use and op
+  construction (making it cheap to construct, and keeping optimizations on the
+  dialect close to optimizations on the client ops).
+
+Entry:
+
+* The vast majority of old "client" interactions are via the XlaBuilder APIs.
+  These APIs are used by TF2XLA kernels, JAX, the PyTorch bridge, and directly.
+  The legalization path (described below) can also reuse the XlaBuilder's APIs
+  to construct XLA Client HLO ops directly (this uses MlirXlaBuilder, which is a
+  subclass of XlaBuilder).
+* The other entry point is during legalization from TensorFlow ops in the TF
+  Graph Compiler and other tools (e.g., SavedModel lowering and TFCompile).
+
+Exit:
+
+* MHLO.
+* May be exported to xla::HloInstructionProto by invoking the XlaBuilder APIs
+  (with the regular XlaBuilder).
+
+The `chlo` dialect started originally as a mapping to the XLA client Builder
+APIs. It can be both constructed from and converted back to the existing XLA
+interfaces using the XlaBuilder API.
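+
+For illustration only (a schematic example in the generic MLIR syntax; the
+operand names and shapes here are invented, not taken from this change), a
+`chlo` binary op can accept operands of different shapes and leave the
+broadcast implicit:
+
+    "chlo.broadcast_add"(%scalar, %vec) : (tensor<f32>, tensor<4xf32>) -> tensor<4xf32>
+
+The legalization out of `chlo` is then expected to materialize such broadcasts
+explicitly in the lower-level dialects.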
+
+Due to the way that translation into and out of the dialect works, there is no
+expectation that this dialect roundtrips to XLA (i.e., it is only intended to be
+translated to MLIR and then legalized to another dialect or translated to
+HloInstructionProto).
+
+The export approach of reusing the XlaBuilders enables reusing a lot of logic
+that was already implemented in terms of computing shapes, inserting broadcasts,
+etc.
+
+An important topic here is that the XLA Client HLO ops are not a well-defined
+set; in particular, what some would consider helper functions, others would
+consider ops. It should be easy to move between the two, so one can either
+define a new op along with the helper function or autogenerate the helper
+functions from the descriptions of the ops. For the former, a simple approach
+would be to consider the context in which the op is being constructed and, if it
+is an MLIR one, construct an op in the client dialect instead of making further
+calls into XlaBuilder. The latter could be implemented by adding the op and a
+legalization of the op to other known ops, from which a helper function can be
+generated and used as usual.
+
+Status: Exists but needs to be cleaned up.
+
+### Meta HLO Dialect `mhlo`
+
+* The dialect is closer to the current HLO server ops (e.g., no implicit
+  broadcast).
+* The MHLO dialect is where we can deviate from the requirements of the client
+  or server dialect, in particular:
+  * Control flow ops with implicit capture to enable simpler optimizations
+    (e.g., generic LICM, unroll & jam, etc.)
+  * Multiple-result ops (e.g., no tuples)
+  * More ops (for example, unique op or assert op), and ops that don't need
+    to be added to either client or server dialect.
+  * An op set not constrained by implementation (e.g., hlo.add operating on,
+    say, i79 or !mydialect.weird_type is allowed even though no XLA backend
+    supports it). Verification on types happens at the boundaries.
+  * It does not need to preserve some deprecated XLA constructs (e.g.
+    stateful RNG HLO).
+  * More dynamic shape support ops without the need to update all
+    users/backends.
+* This dialect enables evolving HLO independently from XLA in order to
+  experiment with features we'd like to upstream in MLIR TCP. In particular, it
+  intends to be user-extensible through
+  [interfaces](https://mlir.llvm.org/docs/Interfaces/).
+* It should have no TensorFlow, proto, or other Google-internal dependencies.
+* It need not be a complete superset of ops compared to the XLA HLO dialect.
+
+Entry:
+
+* Legalization from the `chlo` dialect or conversion from XLA HLO.
+* Directly emitted from the TF Graph Compiler.
+* Builder call (e.g., EDSL).
+
+Exit:
+
+* LMHLO, Linalg, IREE, or directly used in codegen.
+* XLA HLO.
+
+The MHLO dialect has no direct export format; it is only meant as an
+intermediate optimization dialect/format. It is also where we can experiment
+cheaply with new ops. This format will be where the representation starts to
+differ from existing endpoints.
+
+Status: Exists but needs to be cleaned up and evolved, in particular with
+respect to supporting dynamic shapes.
+
+### LMHLO
+
+LMHLO corresponds to late `mhlo` and operates on the buffer domain (e.g.,
+memref) with side-effecting operations. The lowering from the `mhlo` dialect
+proceeds by way of scheduling, memory, and buffer allocation. The current
+mapping is directly on XLA Client HLOs but without implicit broadcast and with
+operations on memrefs. This dialect will instead be rebased on the `mhlo`
+dialect but will still operate on buffers.
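+
+A minimal sketch of the difference (again in generic MLIR syntax, with invented
+operand names): an `mhlo` op produces a new tensor value, while the
+corresponding `lmhlo` op takes an extra memref operand and writes its result
+into that caller-provided buffer:
+
+    %sum = "mhlo.add"(%lhs, %rhs) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
+    "lmhlo.add"(%lhs_buf, %rhs_buf, %sum_buf) : (memref<4xf32>, memref<4xf32>, memref<4xf32>) -> ()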
+ +Entry: + +* Post buffer assignment on `mhlo` dialect, or from XLA after buffer + assignment. + +Exit: + +* Codegen (LLVM IR in the common cases at the moment) + +## End-to-End pipeline + +TODO diff --git a/tensorflow/compiler/mlir/hlo/build_tools/build_mlir.sh b/tensorflow/compiler/mlir/hlo/build_tools/build_mlir.sh new file mode 100755 index 00000000000..5ccefb9416f --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/build_tools/build_mlir.sh @@ -0,0 +1,52 @@ +#!/bin/bash +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +if [[ $# -ne 2 ]] ; then + echo "Usage: $0 " + exit 1 +fi + +# LLVM source +LLVM_SRC_DIR="$1" +build_dir="$2" + +if ! [ -f "$LLVM_SRC_DIR/llvm/CMakeLists.txt" ]; then + echo "Expected the path to LLVM to be set correctly (got '$LLVM_SRC_DIR'): can't find CMakeLists.txt" + exit 1 +fi +echo "Using LLVM source dir: $LLVM_SRC_DIR" + +# Setup directories. +echo "Building MLIR in $build_dir" +mkdir -p "$build_dir" + +echo "Beginning build (commands will echo)" +set -x + +cmake -GNinja \ + "-H$LLVM_SRC_DIR/llvm" \ + "-B$build_dir" \ + -DLLVM_INSTALL_UTILS=ON \ + -DLLVM_ENABLE_LLD=ON \ + -DLLVM_ENABLE_PROJECTS=mlir \ + -DLLVM_TARGETS_TO_BUILD="X86;NVPTX;AMDGPU" \ + -DLLVM_INCLUDE_TOOLS=ON \ + -DLLVM_BUILD_TOOLS=OFF \ + -DLLVM_INCLUDE_TESTS=OFF \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DLLVM_ENABLE_ASSERTIONS=On + +cmake --build "$build_dir" --target all --target mlir-cpu-runner diff --git a/tensorflow/compiler/mlir/hlo/build_tools/llvm_version.txt b/tensorflow/compiler/mlir/hlo/build_tools/llvm_version.txt new file mode 100644 index 00000000000..0d5446142ec --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/build_tools/llvm_version.txt @@ -0,0 +1,2 @@ + + diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/CMakeLists.txt b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/CMakeLists.txt new file mode 100644 index 00000000000..92759d76383 --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/CMakeLists.txt @@ -0,0 +1,16 @@ +# +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +add_subdirectory(Dialect) diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/CMakeLists.txt b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/CMakeLists.txt new file mode 100644 index 00000000000..5ee1a1924ec --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/CMakeLists.txt @@ -0,0 +1,16 @@ +# +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +add_subdirectory(mhlo) diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/CMakeLists.txt b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/CMakeLists.txt new file mode 100644 index 00000000000..e138afa587f --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/CMakeLists.txt @@ -0,0 +1,17 @@ +# +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +add_subdirectory(IR) +add_subdirectory(transforms) diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/CMakeLists.txt b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/CMakeLists.txt new file mode 100644 index 00000000000..09bdca84cd3 --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/CMakeLists.txt @@ -0,0 +1,31 @@ +# +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# Need a separate function because of the .cc vs .cpp used in the one provided by MLIR +function(add_mlir_hlo_dialect dialect dialect_namespace) + set(LLVM_TARGET_DEFINITIONS ${dialect}.td) + mlir_tablegen(${dialect}.h.inc -gen-op-decls) + mlir_tablegen(${dialect}.cc.inc -gen-op-defs) + mlir_tablegen(${dialect}_structs.h.inc -gen-struct-attr-decls) + mlir_tablegen(${dialect}_structs.cc.inc -gen-struct-attr-defs) + add_public_tablegen_target(MLIR${dialect}IncGen) + add_dependencies(mlir-headers MLIR${dialect}IncGen) +endfunction() + +add_mlir_hlo_dialect(chlo_ops chlo) +add_mlir_hlo_dialect(hlo_ops mhlo) +add_mlir_hlo_dialect(lhlo_ops lmhlo) + +add_mlir_interface(infer_fusibility_op_interface) diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.h index 1fbf55ded83..9704f34a4d6 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.h +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.h @@ -17,27 +17,48 @@ limitations under the License. #define TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_CHLO_OPS_H_ #include "llvm/ADT/StringRef.h" -#include "mlir/IR/Dialect.h" // from @llvm-project -#include "mlir/IR/DialectImplementation.h" // from @llvm-project -#include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/IR/OpDefinition.h" // from @llvm-project -#include "mlir/IR/Operation.h" // from @llvm-project -#include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/IR/Types.h" // from @llvm-project -#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project -#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project +#include "mlir-hlo/Dialect/mhlo/IR/infer_fusibility_op_interface.h" +#include "mlir/IR/Dialect.h" +#include "mlir/IR/DialectImplementation.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/IR/TypeUtilities.h" +#include "mlir/IR/Types.h" +#include "mlir/Interfaces/InferTypeOpInterface.h" +#include "mlir/Interfaces/SideEffectInterfaces.h" namespace mlir { namespace chlo { class HloClientDialect : public Dialect { + void initialize(); + public: - explicit HloClientDialect(MLIRContext *context); + explicit HloClientDialect(MLIRContext *context) + : Dialect(getDialectNamespace(), context, + TypeID::get()) { + initialize(); + } static StringRef getDialectNamespace() { return "chlo"; } }; #define GET_OP_CLASSES -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.h.inc" +#include "mlir-hlo/Dialect/mhlo/IR/chlo_ops.h.inc" + +template +static Value getConstantLike(OpBuilder& b, T constant, Value val) { + Type ty = getElementTypeOrSelf(val.getType()); + + auto getAttr = [&]() -> Attribute { + if (ty.isa()) return b.getIntegerAttr(ty, constant); + if (ty.isa()) return b.getFloatAttr(ty, constant); + llvm_unreachable("unhandled element type"); + }; + // TODO(jpienaar): Add ability to pass loc via native call and update. 
+ return b.create(b.getUnknownLoc(), getAttr(), val); +} } // namespace chlo } // namespace mlir diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.td index 79d6fb25318..2f3bbefb5ab 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.td +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.td @@ -33,6 +33,7 @@ include "mlir/IR/OpBase.td" include "mlir/Interfaces/InferTypeOpInterface.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.td" +include "mlir-hlo/Dialect/mhlo/IR/infer_fusibility_op_interface.td" def HLOClient_Dialect : Dialect { let name = "chlo"; @@ -338,6 +339,49 @@ def HLOClient_BroadcastComplexOp : HLOClient_BroadcastBinaryElementwiseOp< let results = (outs HLO_ComplexTensor); } +//===----------------------------------------------------------------------===// +// Unary op +//===----------------------------------------------------------------------===// + +class HLOClient_UnaryElementwiseOp traits, + Type TensorType>: HLOClient_Op { + let arguments = (ins TensorType:$operand); + let results = (outs TensorType); +} + +def HLOClient_AcosOp: HLOClient_UnaryElementwiseOp<"acos", + [NoSideEffect, SameOperandsAndResultType], HLO_FpOrComplexTensor> { + let summary = "Acos operator"; + + let description = [{ + Returns `Acos(operand)` element-wise. + + $$ + \acos(x) = 2 * \atan(\sqrt(1 - x^2) / (1 + x)) if x != -1 + = pi if x == -1 + $$ + }]; +} + +def HLOClient_ConstantLikeOp: HLOClient_Op<"constant_like", + [NoSideEffect, SameOperandsAndResultShape, + InferTypeOpInterface, + DeclareOpInterfaceMethods, + NativeOpTrait<"InferTensorType">]> { + let summary = "Constant like operator"; + + let description = [{ + Returns a splat constant of the same shape as the operand. + }]; + + // TODO(jpienaar): value's type could be tightened. + let arguments = (ins AnyAttr:$value, HLO_Tensor:$operand); + let results = (outs HLO_Tensor); + + let hasCanonicalizer = 1; +} + //===----------------------------------------------------------------------===// // Broadcasting compare op //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h index 4de52639bca..0036cc0dc19 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h @@ -19,23 +19,23 @@ limitations under the License. 
#define TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_HLO_OPS_H_ #include "llvm/ADT/StringRef.h" -#include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/Dialect.h" // from @llvm-project -#include "mlir/IR/DialectImplementation.h" // from @llvm-project -#include "mlir/IR/Location.h" // from @llvm-project -#include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/IR/OpDefinition.h" // from @llvm-project -#include "mlir/IR/Operation.h" // from @llvm-project -#include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/IR/Types.h" // from @llvm-project -#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project -#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/infer_fusibility_op_interface.h" +#include "mlir-hlo/Dialect/mhlo/IR/infer_fusibility_op_interface.h" +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Dialect.h" +#include "mlir/IR/DialectImplementation.h" +#include "mlir/IR/Location.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/IR/Types.h" +#include "mlir/Interfaces/InferTypeOpInterface.h" +#include "mlir/Interfaces/SideEffectInterfaces.h" namespace mlir { class OpBuilder; -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_structs.h.inc" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops_structs.h.inc" namespace mhlo { @@ -91,7 +91,7 @@ LogicalResult deriveShapeFromFirstOperand( SmallVectorImpl *reifiedReturnShapes); #define GET_OP_CLASSES -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h.inc" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h.inc" } // end namespace mhlo } // end namespace mlir diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.td index 0ed4235e23f..d0abbe043ea 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.td +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.td @@ -40,6 +40,14 @@ class HLO_Op traits> : let verifier = [{ return Verify(*this); }]; } +def HLO_LOOP_FUSION : StrEnumAttrCase<"kLoop">; +def HLO_INPUT_FUSION : StrEnumAttrCase<"kInput">; +def HLO_OUTPUT_FUSION : StrEnumAttrCase<"kOutput">; +def HLO_CUSTOM_FUSION : StrEnumAttrCase<"kCustom">; +def HLO_FusionKindAttr : StrEnumAttr<"FusionKind", "fusion kind", [ + HLO_LOOP_FUSION, HLO_INPUT_FUSION, HLO_OUTPUT_FUSION, HLO_CUSTOM_FUSION +]>; + //===----------------------------------------------------------------------===// // MHLO nullary op definitions. 
//===----------------------------------------------------------------------===// @@ -52,15 +60,14 @@ def HLO_ConstOp : HLO_Op<"constant", ); let results = (outs - HLO_Tensor:$output + HLO_StaticShapeTensor:$output ); let builders = [OpBuilder< "OpBuilder &builder, OperationState &result, Attribute value" >]; - let printer = [{ return Print(*this, &p); }]; - let parser = [{ return ParseConstOp(&parser, &result); }]; + let assemblyFormat = "attr-dict $value"; let hasFolder = 1; @@ -656,13 +663,14 @@ def HLO_GetTupleElementOp: HLO_Op<"get_tuple_element", [NoSideEffect]>, BASE_HLO } def HLO_TupleOp : HLO_Op<"tuple", [NoSideEffect]>, BASE_HLO_TupleOp { - let arguments = (ins Variadic:$val); + let arguments = (ins Variadic:$val); let results = (outs HLO_Tuple); let builders = [OpBuilder< "OpBuilder &builder, OperationState &results, " "ValueRange values">]; + let hasCanonicalizer = 1; } def HLO_CompareOp: HLO_Op<"compare", @@ -1067,7 +1075,10 @@ def HLO_GetDimensionSizeOp: HLO_Op<"get_dimension_size", [NoSideEffect]>, HLO_Tensor:$operand, I32Attr:$dimension ); - let results = (outs HLO_IntTensor); + // TODO(hinsu): Allow 64-bit result types once XLA HLO dialect based on the + // XLA semantics is available. This limitation is because of the current XLA + // implementation. + let results = (outs I32Tensor); } def HLO_MapOp: HLO_Op<"map", @@ -1318,13 +1329,14 @@ def HLO_TorchIndexSelectOp : HLO_Op<"torch_index_select", [NoSideEffect]> { } //===----------------------------------------------------------------------===// -// MHLO RngUniform Operator. +// MHLO RNG Operators. //===----------------------------------------------------------------------===// + def HLO_RngUniformOp : HLO_Op<"rng_uniform", []>, BASE_HLO_RngUniformOp { let arguments = (ins HLO_PredIntOrFpTensor:$a, HLO_PredIntOrFpTensor:$b, - I64Tensor:$shape + HLO_DimensionTensor:$shape ); let results = (outs HLO_PredIntOrFpTensor); @@ -1336,7 +1348,7 @@ def HLO_RngNormalOp : HLO_Op<"rng_normal", []>, BASE_HLO_RngNormalOp { let arguments = (ins HLO_FpTensor:$mu, HLO_FpTensor:$sigma, - I64Tensor:$shape + HLO_DimensionTensor:$shape ); let results = (outs HLO_FpTensor); @@ -1344,6 +1356,19 @@ def HLO_RngNormalOp : HLO_Op<"rng_normal", []>, BASE_HLO_RngNormalOp { let hasCustomHLOConverter = 1; } +def HLO_RngBitGeneratorOp : HLO_Op<"rng_bit_generator", [NoSideEffect]>, BASE_HLO_RngBitGeneratorOp { + let arguments = (ins + // TODO(jpienaar): This could be an enum instead. + I32Attr:$rng_algorithm, + HLO_IntOrFpTensor:$initial_state + ); + + let results = (outs HLO_TensorOrTuple:$result); + + // TODO(jpienaar): This should not be needed. + let hasCustomHLOConverter = 1; +} + //===----------------------------------------------------------------------===// // MHLO Quantize Operator. 
//===----------------------------------------------------------------------===// @@ -1375,7 +1400,8 @@ def HLO_FusionOp : HLO_Op<"fusion", []> { let regions = (region SizedRegion<1>:$fused_computation); let arguments = (ins - Variadic:$operands + Variadic:$operands, + OptionalAttr:$fusion_kind ); let results = (outs diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.td index 7f9784d7f11..2f80545ad19 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.td +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.td @@ -316,6 +316,19 @@ class BASE_HLO_RealOp { }]; } +class BASE_HLO_RngBitGeneratorOp { + string summary = "Uniform random number generator operator"; + + string description = [{ + Returns an output with a given shape filled with uniform random bits using + the specified algorithm (or backend default) and returns an updated state + (with the same shape as initial state) and the generated random data. + + See + https://www.tensorflow.org/xla/operation_semantics#rngbitgenerator. + }]; +} + class BASE_HLO_RoundOp { string summary = "Round operator"; diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_utils.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_utils.td index e1ae9e1fb89..c201aeff8ec 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_utils.td +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_utils.td @@ -27,6 +27,9 @@ def CastIntElementsAttr : NativeCodeCall<"$0.cast()">; class ConstantSplat : NativeCodeCall< "hlo::getSplat(&$_builder, $0, " # value # ")">; +class HLO_ConstantLike : NativeCodeCall< + "chlo::getConstantLike($_builder, " # value # ", $0)">; + def NullDenseIntElementsAttr : NativeCodeCall<"DenseIntElementsAttr()">; def BinBroadcastDimensions : NativeCodeCall< diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/infer_fusibility_op_interface.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/infer_fusibility_op_interface.h index ecbf2e05000..00de1170f8a 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/infer_fusibility_op_interface.h +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/infer_fusibility_op_interface.h @@ -21,7 +21,7 @@ limitations under the License. namespace mlir { -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/infer_fusibility_op_interface.h.inc" +#include "mlir-hlo/Dialect/mhlo/IR/infer_fusibility_op_interface.h.inc" } // namespace mlir diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/infer_fusibility_op_interface.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/infer_fusibility_op_interface.td index eb2c1ba3ffe..f8e02d413e9 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/infer_fusibility_op_interface.td +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/infer_fusibility_op_interface.td @@ -140,7 +140,7 @@ def InferFusibilityOpInterface : OpInterface<"InferFusibilityOpInterface"> { Here the effective workload shape roughly represents the maximum parallelism can be used during the codegen stage. It's used to check the shape-compatibility of the operation. During fusion, we only - try to fuse shape-compatible ops for performace. + try to fuse shape-compatible ops for performance. 
For example, the effective workload shape of an elementwise op is its output shape, while the effective workload shape of a reduction op may be its operand shape. diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h index fd31bec44c0..bb9b29096f3 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h @@ -19,21 +19,21 @@ limitations under the License. #define TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_LHLO_OPS_H_ #include "llvm/ADT/StringRef.h" -#include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/Dialect.h" // from @llvm-project -#include "mlir/IR/Location.h" // from @llvm-project -#include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/IR/OpDefinition.h" // from @llvm-project -#include "mlir/IR/Operation.h" // from @llvm-project -#include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/IR/Types.h" // from @llvm-project -#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project -#include "mlir/Interfaces/ViewLikeInterface.h" // from @llvm-project +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Dialect.h" +#include "mlir/IR/Location.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/IR/Types.h" +#include "mlir/Interfaces/SideEffectInterfaces.h" +#include "mlir/Interfaces/ViewLikeInterface.h" namespace mlir { class OpBuilder; -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_structs.h.inc" +#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops_structs.h.inc" namespace lmhlo { @@ -44,7 +44,7 @@ class LmhloDialect : public Dialect { }; #define GET_OP_CLASSES -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h.inc" +#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h.inc" } // namespace lmhlo } // end namespace mlir diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.td index 87082219db7..3fa46584ca2 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.td +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.td @@ -66,6 +66,8 @@ def LHLO_PredOrIntBuffer : MemRefOf<[HLO_Int, HLO_Pred]>; def LHLO_Buffer : MemRefOf<[AnyFloat, AnySignlessInteger, AnyComplex]>; +def LHLO_ExtentBuffer : MemRefRankOf<[AnySignlessInteger, Index], [1]>; + //===----------------------------------------------------------------------===// // LMHLO nullary op definitions. //===----------------------------------------------------------------------===// @@ -467,7 +469,7 @@ def ReshapeMemRefCastOp: Op:$shape + LHLO_ExtentBuffer:$shape ); let results = (outs AnyRankedOrUnrankedMemRef:$result); diff --git a/tensorflow/core/lib/bfloat16/bfloat16.cc b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/register.h similarity index 61% rename from tensorflow/core/lib/bfloat16/bfloat16.cc rename to tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/register.h index e6e24bc0786..5773901ad78 100644 --- a/tensorflow/core/lib/bfloat16/bfloat16.cc +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/register.h @@ -1,4 +1,4 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,16 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/lib/bfloat16/bfloat16.h" +#ifndef MLIR_HLO_DIALECT_MHLO_IR_REGISTER_H_ +#define MLIR_HLO_DIALECT_MHLO_IR_REGISTER_H_ -#include "third_party/eigen3/Eigen/Core" +namespace mlir { +namespace mhlo { -namespace tensorflow { +void registerAllDialects(); -const uint16_t bfloat16::NAN_VALUE; -const uint16_t bfloat16::ZERO_VALUE; - -B16_DEVICE_FUNC bfloat16::operator Eigen::half() const { - return static_cast(float(*this)); } -} // end namespace tensorflow +} // namespace mlir + +#endif // MLIR_HLO_DIALECT_MHLO_IR_REGISTER_H_ diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/CMakeLists.txt b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/CMakeLists.txt new file mode 100644 index 00000000000..6de6851b8d7 --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/CMakeLists.txt @@ -0,0 +1,23 @@ +# +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +set(LLVM_TARGET_DEFINITIONS mhlo_passes.td) +mlir_tablegen(mhlo_passes.h.inc -gen-pass-decls -name MHLO) +add_public_tablegen_target(MLIRMhloPassIncGen) + +set(LLVM_TARGET_DEFINITIONS lmhlo_passes.td) +mlir_tablegen(lmhlo_passes.h.inc -gen-pass-decls -name LMHLO) +add_public_tablegen_target(MLIRLmhloPassIncGen) diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/lmhlo_passes.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/lmhlo_passes.td new file mode 100644 index 00000000000..963ff5dbacf --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/lmhlo_passes.td @@ -0,0 +1,65 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +include "mlir/Pass/PassBase.td" + +def LhloCopyRemovalPass : Pass<"lhlo-copy-removal", "FuncOp"> { + let summary = "Removes redundant LHLO copy operations."; + let constructor = "createLhloCopyRemovalPass()"; +} + + +def LhloLegalizeToLinalgPass : Pass<"lhlo-legalize-to-linalg", "FuncOp"> { + let summary = "Legalize from LHLO dialect to Linalg dialect."; + let constructor = "createLegalizeLhloToLinalgPass()"; +} + + +def LhloFuseLinalgPass : Pass<"lhlo-fuse-linalg", "FuncOp"> { + let summary = "Greedily fuse linalg ops obtained after LHLO lowering."; + let constructor = "createLhloFuseLinalgPass()"; + let options = [ + Option<"use_parallel_loops_", "use-parallel-loops", "bool", + /*default=*/"false", "Tiles GenericOp consumer to parallel loops before linalg fusion">, + ListOption<"tile_sizes_", "tile-sizes", "unsigned", + "Tile sizes by which to tile linalg operations before fusion", + "llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated">, + ]; +} + + +def LhloLegalizeToAffinePass : Pass<"lhlo-legalize-to-affine", "FuncOp"> { + let summary = "Legalize from LHLO dialect to affine dialect."; + let constructor = "createLhloLegalizeToAffinePass()"; +} + + +def LhloLegalizeToGpuPass : Pass<"lhlo-legalize-to-gpu", "FuncOp"> { + let summary = "Legalize from LHLO dialect to GPU dialect."; + let constructor = "createLegalizeToGpuPass()"; +} + + +def TestLhloToLLVMPass : Pass<"test-lhlo-legalize-to-llvm", "FuncOp"> { + let summary = "Legalize from LHLO dialect to LLVM."; + let constructor = "createTestLhloToLLVMPass()"; +} + + +def LhloLegalizeToParallelLoopsPass : Pass<"lhlo-legalize-to-parallel-loops", "FuncOp"> { + let summary = "Legalize from LHLO dialect to parallel loops."; + let constructor = "createLegalizeLhloToParallelLoopsPass()"; +} + diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_hlo_to_lhlo_op.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_hlo_to_lhlo_op.h index a0246f93180..c51bcfcfe89 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_hlo_to_lhlo_op.h +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_hlo_to_lhlo_op.h @@ -18,8 +18,8 @@ limitations under the License. #include -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" namespace mlir { namespace mhlo { diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_lmhlo_to_scalar_op.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_lmhlo_to_scalar_op.h index 5d2bffcec2a..2bb5ab2888d 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_lmhlo_to_scalar_op.h +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_lmhlo_to_scalar_op.h @@ -18,10 +18,10 @@ limitations under the License.
#include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" -#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_hlo_to_lhlo_op.h" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/transforms/map_hlo_to_lhlo_op.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" namespace mlir { namespace lmhlo { diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/mhlo_passes.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/mhlo_passes.td new file mode 100644 index 00000000000..fa3bde24df1 --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/mhlo_passes.td @@ -0,0 +1,108 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +include "mlir/Pass/PassBase.td" + +def TestChloLegalizeToHloPass : Pass<"mhlo-test-chlo-legalize-to-hlo", "FuncOp"> { + let summary = "Test pass for applying chlo -> hlo legalization patterns."; + let constructor = "createTestChloLegalizeToHloPass()"; +} + +def HloLegalizeToLhloPass : Pass<"hlo-legalize-to-lhlo", "ModuleOp"> { + let summary = "Legalize from HLO dialect to LHLO dialect."; + let constructor = "createLegalizeToLhloPass()"; +} + +def LegalizeControlFlowPass : Pass<"mhlo-legalize-control-flow", "FuncOp"> { + let summary = "Legalize from MHLO control flow to CFG control flow."; + let constructor = "createLegalizeControlFlowPass()"; +} + +def LegalizeGatherToTorchIndexSelectPass : Pass<"mhlo-legalize-gather-to-torch-index-select", "FuncOp"> { + let summary = "Legalizes gathers to a torch index select."; + let constructor = "createLegalizeGatherToTorchIndexSelectPass()"; +} + + +def LegalizeTanhToApproximationPass : Pass<"mhlo-legalize-tanh-to-approximation", "FuncOp"> { + let summary = "Legalize tanh from standard dialect to an approximation."; + let constructor = "createLegalizeTanhToApproximationPass()"; +} + + +def HloLegalizeToLinalgPass : Pass<"hlo-legalize-to-linalg", "FuncOp"> { + let summary = "Legalize from HLO dialect to Linalg dialect."; + let constructor = "createLegalizeHloToLinalgPass()"; +} + + +def LegalizeToStandardPass : Pass<"mhlo-legalize-to-std", "FuncOp"> { + let summary = "Legalize from MHLO dialect to standard dialect."; + let constructor = "createLegalizeToStdPass()"; +} + +def LowerComplexPass : Pass<"mhlo-test-lower-complex", "FuncOp"> { + let summary = "Lower complex operations into non-complex operations."; + let constructor = "createLowerComplexPass()"; +} + + +def LegalizeGeneralDotPass : Pass<"mhlo-test-lower-general-dot", "FuncOp"> { + let summary = "Tests lowering general dot to a 
non-batched dot when possible."; + let constructor = "createLegalizeGeneralDotPass()"; +} + + +def TestMaterializeBroadcastsPass : Pass<"mhlo-test-materialize-broadcasts", "FuncOp"> { + let summary = "Test pass for materializing 'broadcast_dimensions' attributes."; + let constructor = "createTestMaterializeBroadcastsPass()"; +} + + +def MhloFusionPass : Pass<"mhlo-fusion", "FuncOp"> { + let summary = "Fuse mhlo ops to kLoop/kInput fusion patterns."; + let constructor = "createMhloFusionPass()"; +} + + +def OptimizeMhloPass : Pass<"mhlo-test-optimize", "FuncOp"> { + let summary = "Run optional HLO optimizations."; + let constructor = "createOptimizeMhloPass()"; +} + + +def SinkConstantsToControlFlowPass : Pass<"mhlo-sink-constants-to-control-flow", "FuncOp"> { + let summary = "Sink constants implicitly captured in control flow regions. This " + "is necessary to export to XLA."; + let constructor = "createSinkConstantsToControlFlowPass()"; +} + + +def TestInferShapedTypeMethodsPass : Pass<"mhlo-test-infer-shaped-type-methods", "FuncOp"> { + let summary = "Uses test ops to invoke InferShapedTypeOpInterface methods."; + let constructor = "createTestInferShapedTypeMethodsPass()"; +} + + +def TransformUnrankedHloPass : Pass<"transform-unranked-hlo", "FuncOp"> { + let summary = "Realize element-wise operations on ranked tensors where possible."; + let constructor = "createTransformUnrankedHloPass()"; +} + + +def TestUnfuseBatchNormPass : Pass<"mhlo-test-unfuse-batch-norm", "FuncOp"> { + let summary = "Test pass for unfusing batch norm into primitive operations."; + let constructor = "createTestUnfuseBatchNormPass()"; +} diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h index 9ea39e95fef..efa116f3f0d 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h @@ -23,6 +23,7 @@ limitations under the License. namespace mlir { class FuncOp; +class FunctionPass; class ModuleOp; class Operation; template @@ -58,18 +59,26 @@ std::unique_ptr> createSinkConstantsToControlFlowPass(); // fuse mhlo ops to kLoop/kInput fusion patterns std::unique_ptr> createMhloFusionPass(); +/// Lowers the standard TanhOp to an approximation that does not use intrinsics. +std::unique_ptr> createLegalizeTanhToApproximationPass(); + +std::unique_ptr createOptimizeMhloPass(); +std::unique_ptr createLowerComplexPass(); +std::unique_ptr<::mlir::Pass> createLegalizeGeneralDotPass(); +std::unique_ptr createLegalizeGatherToTorchIndexSelectPass(); + } // namespace mhlo namespace lmhlo { // Lowers from LHLO dialect to Affine dialect. -std::unique_ptr> createLegalizeToAffinePass(); +std::unique_ptr> createLhloLegalizeToAffinePass(); // Lowers from LHLO dialect to Linalg dialect. std::unique_ptr> createLegalizeLhloToLinalgPass(); // Lowers from LHLO dialect to GPU dialect. -std::unique_ptr> createLegalizeToGpuPass(); +std::unique_ptr createLegalizeToGpuPass(); // Fuses linalg ops obtained after LHLO lowering. To enable fusion, // operations are first tiled. @@ -80,7 +89,7 @@ std::unique_ptr> createLegalizeToGpuPass(); // 'tile_sizes' provides the tile sizes to use for tiling. If the linalg // operation has more dimensions than tile sizes provided, 1 is used as // default.
-std::unique_ptr> createLhloFuseLinalg( +std::unique_ptr createLhloFuseLinalgPass( bool use_parallel_loops = false, llvm::ArrayRef tile_sizes = {}); // Removes unnecessary LHLO copies which copy from the allocated buffers to the @@ -94,12 +103,6 @@ std::unique_ptr> createLegalizeLhloToParallelLoopsPass(); } // namespace lmhlo -namespace hlo { - -/// Lowers the standard TanhOp to an approximation that does not use intrinsics. -std::unique_ptr> createLegalizeTanhToApproximationPass(); - -} // namespace hlo } // namespace mlir #endif // TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_TRANSFORMS_PASSES_H_ diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/register_passes.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/register_passes.h new file mode 100644 index 00000000000..8f70f64359b --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/register_passes.h @@ -0,0 +1,49 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef MLIR_HLO_DIALECT_MHLO_TRANSFORMS_REGISTER_PASSES_H_ +#define MLIR_HLO_DIALECT_MHLO_TRANSFORMS_REGISTER_PASSES_H_ + +#include "mlir-hlo/Dialect/mhlo/transforms/passes.h" +#include "mlir/Pass/Pass.h" + +namespace mlir { +namespace mhlo { + +std::unique_ptr createTestChloLegalizeToHloPass(); +std::unique_ptr createTestInferShapedTypeMethodsPass(); +std::unique_ptr createTestMaterializeBroadcastsPass(); +std::unique_ptr createTestUnfuseBatchNormPass(); + +#define GEN_PASS_REGISTRATION +#include "mlir-hlo/Dialect/mhlo/transforms/mhlo_passes.h.inc" + +inline void registerAllMhloPasses() { registerMHLOPasses(); } + +} // namespace mhlo + +namespace lmhlo { + +std::unique_ptr createTestLhloToLLVMPass(); + +#define GEN_PASS_REGISTRATION +#include "mlir-hlo/Dialect/mhlo/transforms/lmhlo_passes.h.inc" + +inline void registerAllLmhloPasses() { registerLMHLOPasses(); } + +} // namespace lmhlo +} // namespace mlir + +#endif // MLIR_HLO_DIALECT_MHLO_TRANSFORMS_REGISTER_PASSES_H_ diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h index cb9a85a658a..725155e9403 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h @@ -18,9 +18,9 @@ limitations under the License. 
#include -#include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/IR/PatternMatch.h" // from @llvm-project -#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Transforms/DialectConversion.h" namespace mlir { class LLVMTypeConverter; @@ -38,6 +38,13 @@ void PopulateGeneralDotOpLoweringPatterns(OwningRewritePatternList *patterns, void PopulateComplexLoweringPatterns(MLIRContext *context, OwningRewritePatternList *patterns); +void PopulateOptimizeMHLOPatterns(MLIRContext *context, + OwningRewritePatternList *patterns); + +// Rewrite patterns for gather to equivalent torch index select legalization. +void PopulateGatherToTorchIndexSelectPatterns( + mlir::MLIRContext *context, OwningRewritePatternList *patterns); + void PopulateMhloToStdPatterns(OwningRewritePatternList *patterns, MLIRContext *ctx); @@ -73,13 +80,17 @@ void PopulateTransformUnrankedHloPatterns(MLIRContext *context, void PopulateUnfuseBatchNormPatterns(MLIRContext *context, OwningRewritePatternList *patterns); +// Populates a pattern that translates the standard TanhOp to an approximation +// that does not use intrinsics. +void PopulateTanhToApproximationPatterns(MLIRContext *context, + OwningRewritePatternList *patterns); + } // namespace mhlo namespace lmhlo { /// Collect a set of patterns to convert from the LHLO dialect to LLVM. -void PopulateLhloToLLVMConversionPatterns(const LowerToLLVMOptions &options, - LLVMTypeConverter *converter, +void PopulateLhloToLLVMConversionPatterns(LLVMTypeConverter *converter, OwningRewritePatternList *patterns); } // namespace lmhlo @@ -93,14 +104,6 @@ void PopulateLegalizeChloToHloPatterns(MLIRContext *context, } // namespace chlo -namespace hlo { - -// Populates a pattern that translates the standard TanhOp to an approximation -// that does not use intrinsics. -void PopulateTanhToApproximationPatterns(MLIRContext *context, - OwningRewritePatternList *patterns); - -} // namespace hlo } // namespace mlir #endif // TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_TRANSFORMS_REWRITERS_H_ diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/broadcast_utils.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/broadcast_utils.h index 3be7d42cc25..1c57073f4ab 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/broadcast_utils.h +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/broadcast_utils.h @@ -19,12 +19,12 @@ limitations under the License. // Utilities relating to implementing HLO broadcasting. // Note: This file should not depend on any non-MLIR TensorFlow libraries. 
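An aside, not part of the patch: the Populate*Patterns entry points declared in rewriters.h above are meant to be wired into passes by their callers. Below is a minimal sketch of that wiring against MLIR APIs of roughly this vintage; the pass name ExampleTanhApproximationPass is hypothetical, and the exact header that declares applyPatternsAndFoldGreedily varies by MLIR revision.

#include "mlir-hlo/Dialect/mhlo/transforms/rewriters.h"
#include "mlir/IR/PatternMatch.h"  // declares applyPatternsAndFoldGreedily in this era (later MLIR moves it to Transforms/GreedyPatternRewriteDriver.h)
#include "mlir/Pass/Pass.h"

namespace {
// Hypothetical function pass that applies the tanh-approximation patterns
// exposed by rewriters.h to every function body.
struct ExampleTanhApproximationPass
    : public mlir::PassWrapper<ExampleTanhApproximationPass,
                               mlir::FunctionPass> {
  void runOnFunction() override {
    mlir::OwningRewritePatternList patterns;
    // Collect the patterns exported by this patch ...
    mlir::mhlo::PopulateTanhToApproximationPatterns(&getContext(), &patterns);
    // ... and apply them greedily to the current function.
    mlir::applyPatternsAndFoldGreedily(getFunction(), patterns);
  }
};
}  // namespace

The in-tree pass constructors declared in passes.h are expected to do essentially this wiring, so downstream users can reuse either the ready-made passes or just the pattern lists. The diff resumes below.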
-#include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/Builders.h" // from @llvm-project -#include "mlir/IR/Location.h" // from @llvm-project -#include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project -#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/Location.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/Interfaces/InferTypeOpInterface.h" +#include "mlir/Support/LLVM.h" namespace mlir { namespace hlo { @@ -38,10 +38,12 @@ bool IsLegalNumpyRankedBroadcast(Value lhs, Value rhs, // Emits shape dialect ops to compute the result shape for a broadcasting // binary elementwise op which broadcasts according to "numpy" semantics -// (see above), returning an extents tensor of the resulting shape. -Value ComputeBinaryElementwiseBroadcastingResultExtents(Location loc, Value lhs, - Value rhs, - OpBuilder& builder); +// (see above), returning a `shape.shape` or an extent tensor of the resulting +// shape. The result should only be an extent tensor in contexts that ensure +// both operands to be broadcastable. +Value ComputeBinaryElementwiseBroadcastingResultExtents( + Location loc, Value lhs, Value rhs, OpBuilder& builder, + bool unsafe_as_extent_tensor); } // namespace hlo } // namespace mlir diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/convert_op_folder.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/convert_op_folder.h index a63df336d8f..4cf74385843 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/convert_op_folder.h +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/convert_op_folder.h @@ -16,8 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_UTILS_CONVERT_OP_FOLDER_H_ #define TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_UTILS_CONVERT_OP_FOLDER_H_ -#include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/Attributes.h" +#include "mlir/IR/StandardTypes.h" namespace mlir { namespace hlo { diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/hlo_utils.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/hlo_utils.h index b31ba231acd..1e335ae6b82 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/hlo_utils.h +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/hlo_utils.h @@ -16,11 +16,11 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_UTILS_HLO_UTILS_H_ #define TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_UTILS_HLO_UTILS_H_ -#include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/Builders.h" // from @llvm-project -#include "mlir/IR/PatternMatch.h" // from @llvm-project -#include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/IR/TypeUtilities.h" namespace mlir { namespace hlo { diff --git a/tensorflow/compiler/mlir/hlo/lib/CMakeLists.txt b/tensorflow/compiler/mlir/hlo/lib/CMakeLists.txt new file mode 100644 index 00000000000..ec65a5ee882 --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/lib/CMakeLists.txt @@ -0,0 +1,17 @@ +# +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +add_subdirectory(Dialect) +add_subdirectory(utils) diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/CMakeLists.txt b/tensorflow/compiler/mlir/hlo/lib/Dialect/CMakeLists.txt new file mode 100644 index 00000000000..5ee1a1924ec --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/CMakeLists.txt @@ -0,0 +1,16 @@ +# +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +add_subdirectory(mhlo) diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/CMakeLists.txt b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/CMakeLists.txt new file mode 100644 index 00000000000..e138afa587f --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/CMakeLists.txt @@ -0,0 +1,17 @@ +# +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +add_subdirectory(IR) +add_subdirectory(transforms) diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/CMakeLists.txt b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/CMakeLists.txt new file mode 100644 index 00000000000..d7bb5057b00 --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/CMakeLists.txt @@ -0,0 +1,82 @@ +# +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +include_directories(BEFORE + ${CMAKE_CURRENT_BINARY_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}) + +set(LLVM_TARGET_DEFINITIONS hlo_patterns.td) +mlir_tablegen(hlo_patterns.cc.inc -gen-rewriters) +add_public_tablegen_target(MLIRMhloRewriterIncGen) + +set(LLVM_TARGET_DEFINITIONS mhlo_canonicalize.td) +mlir_tablegen(mhlo_canonicalize.inc -gen-rewriters) +add_public_tablegen_target(MLIRMhloCanonicalizeIncGen) + +add_mlir_dialect_library(ChloDialect + chlo_ops.cc + + DEPENDS + MLIRchlo_opsIncGen +) +target_link_libraries(ChloDialect PUBLIC MLIRIR) + +add_mlir_library(MhloInferFusibilityOpInterface + infer_fusibility_op_interface.cc + + DEPENDS + MLIRinfer_fusibility_op_interfaceIncGen +) + + +add_mlir_dialect_library(MhloDialect + hlo_ops.cc + + DEPENDS + MLIRhlo_opsIncGen + MLIRMhloCanonicalizeIncGen + MLIRMhloRewriterIncGen + MLIRinfer_fusibility_op_interfaceIncGen +) +target_link_libraries(MhloDialect + PUBLIC + MLIRIR + MhloInferFusibilityOpInterface + MLIRMhloUtils +) + + +add_mlir_dialect_library(LmhloDialect + lhlo_ops.cc + + DEPENDS + MLIRlhlo_opsIncGen +) +target_link_libraries(LmhloDialect PUBLIC MLIRIR) + + +add_mlir_dialect_library(MhloRegisterDialects + init.cc +DEPENDS + MLIRchlo_opsIncGen + MLIRhlo_opsIncGen + MLIRlhlo_opsIncGen +) +target_link_libraries(MhloRegisterDialects + PUBLIC + ChloDialect + MhloDialect + LmhloDialect +) diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/canonicalize.td b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/chlo_canonicalize.td similarity index 100% rename from tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/canonicalize.td rename to tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/chlo_canonicalize.td diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/chlo_ops.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/chlo_ops.cc index c6c193a9d89..b5eacd686bd 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/chlo_ops.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/chlo_ops.cc @@ -13,14 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/IR/chlo_ops.h" -#include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/Builders.h" // from @llvm-project -#include "mlir/IR/Diagnostics.h" // from @llvm-project -#include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/IR/TypeUtilities.h" // from @llvm-project -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/broadcast_utils.h" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "mlir-hlo/utils/broadcast_utils.h" +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/Diagnostics.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/IR/TypeUtilities.h" namespace mlir { namespace chlo { @@ -151,7 +153,7 @@ LogicalResult ReifyBroadcastBinaryOpReturnTypeShapes( } Value computed_shape = hlo::ComputeBinaryElementwiseBroadcastingResultExtents( - loc, lhs, rhs, builder); + loc, lhs, rhs, builder, /*unsafe_as_extent_tensor=*/false); if (!computed_shape) return failure(); reifiedReturnShapes.push_back(computed_shape); return success(); @@ -259,18 +261,59 @@ BROADCAST_BINARY_OP_DEFS(BroadcastXorOp); #undef BROADCAST_INFER_SHAPE_TYPE_OP_DEFS #undef BROADCAST_BINARY_OP_DEFS +static LogicalResult Verify(ConstantLikeOp op) { + if (op.value().getType() != op.getType().cast().getElementType()) + return op.emitOpError() << "value's type doesn't match element return type"; + return success(); +} + +LogicalResult ConstantLikeOp::inferReturnTypeComponents( + MLIRContext* context, Optional location, ValueRange operands, + DictionaryAttr attributes, RegionRange regions, + SmallVectorImpl& inferedReturnShapes) { + ConstantLikeOp::Adaptor op(operands, attributes); + if (failed(op.verify(location.getValue()))) return failure(); + Type element_type = op.value().getType(); + Type operand_type = op.operand().getType(); + if (operand_type.isa()) { + inferedReturnShapes.emplace_back(element_type); + } else { + const auto& shape = operand_type.cast().getShape(); + inferedReturnShapes.emplace_back(shape, element_type); + } + return success(); +} + +struct ConstantLikeToConstant : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(ConstantLikeOp op, + PatternRewriter& rewriter) const override { + auto op_type = op.operand().getType().cast(); + if (!op_type.hasStaticShape()) return failure(); + auto type = RankedTensorType::get(op_type.getShape(), op.value().getType()); + ElementsAttr attr = DenseElementsAttr::get(type, op.value()); + rewriter.replaceOpWithNewOp(op.getOperation(), attr); + return success(); + } +}; + +void ConstantLikeOp::getCanonicalizationPatterns( + OwningRewritePatternList& results, MLIRContext* context) { + results.insert(context); +} + #define GET_OP_CLASSES -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.cc.inc" +#include "mlir-hlo/Dialect/mhlo/IR/chlo_ops.cc.inc" //===----------------------------------------------------------------------===// // chlo Dialect Constructor //===----------------------------------------------------------------------===// -HloClientDialect::HloClientDialect(MLIRContext* context) - : Dialect(getDialectNamespace(), context) { +void HloClientDialect::initialize() { addOperations< #define GET_OP_LIST -#include 
"tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.cc.inc" +#include "mlir-hlo/Dialect/mhlo/IR/chlo_ops.cc.inc" >(); } diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/dialect_registration.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/dialect_registration.cc index f4df946d11a..9d1c354690a 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/dialect_registration.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/dialect_registration.cc @@ -13,9 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/IR/chlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" // Static initialization for *HLO dialects registration. static mlir::DialectRegistration mhlo_ops; diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/hlo_ops.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/hlo_ops.cc index cbd478a0283..f5deb94e3a4 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/hlo_ops.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/hlo_ops.cc @@ -15,7 +15,7 @@ limitations under the License. // This file defines the operations used in the MHLO dialect. -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" #include #include @@ -24,7 +24,6 @@ limitations under the License. #include #include -#include "absl/container/flat_hash_set.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" @@ -35,31 +34,33 @@ limitations under the License. 
#include "llvm/Support/Casting.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/MathExtras.h" -#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project -#include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/Builders.h" // from @llvm-project -#include "mlir/IR/Dialect.h" // from @llvm-project -#include "mlir/IR/Location.h" // from @llvm-project -#include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/IR/Matchers.h" // from @llvm-project -#include "mlir/IR/OpDefinition.h" // from @llvm-project -#include "mlir/IR/OpImplementation.h" // from @llvm-project -#include "mlir/IR/Operation.h" // from @llvm-project -#include "mlir/IR/OperationSupport.h" // from @llvm-project -#include "mlir/IR/PatternMatch.h" // from @llvm-project -#include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/IR/TypeUtilities.h" // from @llvm-project -#include "mlir/IR/Types.h" // from @llvm-project -#include "mlir/IR/Value.h" // from @llvm-project -#include "mlir/Support/LLVM.h" // from @llvm-project -#include "mlir/Support/LogicalResult.h" // from @llvm-project -#include "mlir/Transforms/InliningUtils.h" // from @llvm-project -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h.inc" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/convert_op_folder.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/hlo_utils.h" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h.inc" +#include "mlir-hlo/utils/convert_op_folder.h" +#include "mlir-hlo/utils/hlo_utils.h" +#include "mlir/Dialect/Shape/IR/Shape.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/Dialect.h" +#include "mlir/IR/Location.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Matchers.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/IR/OpImplementation.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/OperationSupport.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/IR/TypeUtilities.h" +#include "mlir/IR/Types.h" +#include "mlir/IR/Value.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Support/LogicalResult.h" +#include "mlir/Transforms/InliningUtils.h" namespace mlir { -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_structs.cc.inc" +#include "hlo_patterns.cc.inc" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops_structs.cc.inc" namespace mhlo { Operation* MhloDialect::materializeConstant(OpBuilder& builder, Attribute value, @@ -104,44 +105,13 @@ DenseIntElementsAttr BuildSliceLimits(DenseIntElementsAttr start_indices, return GetI64ElementsAttr(slice_limits, builder); } -#include "tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/generated_canonicalize.inc" +#include "mhlo_canonicalize.inc" } // namespace //===----------------------------------------------------------------------===// // ConstOp //===----------------------------------------------------------------------===// -static void Print(ConstOp op, OpAsmPrinter* printer) { - // Print op name. - *printer << op.getOperationName(); - - // Elide attribute value while printing the attribute dictionary. 
- SmallVector elided_attrs; - elided_attrs.push_back("value"); - printer->printOptionalAttrDict(op.getAttrs(), elided_attrs); - - *printer << ' ' << op.value(); -} - -static ParseResult ParseConstOp(OpAsmParser* parser, OperationState* result) { - if (parser->parseOptionalAttrDict(result->attributes)) return failure(); - - // If colon is not present after attribute dictionary, it should be short form - // and attribute 'value' is outside the dictionary. - if (failed(parser->parseOptionalColon())) { - Attribute value; - if (parser->parseAttribute(value, "value", result->attributes)) - return failure(); - return parser->addTypeToList(value.getType(), result->types); - } - - // Long form should have type of the result after colon. - Type ty; - if (parser->parseType(ty)) return failure(); - result->types.push_back(ty); - return success(); -} - OpFoldResult ConstOp::fold(ArrayRef operands) { assert(operands.empty() && "constant has no operands"); @@ -339,6 +309,33 @@ void DynamicIotaOp::getCanonicalizationPatterns( results.insert(context); } +//===----------------------------------------------------------------------===// +// DynamicUpdateSliceOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(DynamicUpdateSliceOp op) { + OperandRange indices = op.start_indices(); + if (indices.size() <= 1) return success(); + + // Note: start_indices is constrained to Variadic, so it + // is OK to cast indices to ShapedType here. + auto idx_tensor = indices.take_front().front().getType().cast(); + Type first_elem_ty = idx_tensor.getElementType(); + Type elem_ty; + + for (auto idx : llvm::drop_begin(indices, 1)) { + idx_tensor = idx.getType().cast(); + elem_ty = idx_tensor.getElementType(); + + if (first_elem_ty != elem_ty) { + return op.emitOpError() << "start indices must have same element type " + "(encountered mismatch: " + << first_elem_ty << " vs " << elem_ty << ")"; + } + } + return success(); +} + //===----------------------------------------------------------------------===// // AbsOp //===----------------------------------------------------------------------===// @@ -373,8 +370,8 @@ static LogicalResult Verify(CollectivePermuteOp op) { << "expect source_target_pairs attribute of shape (N, 2), but got (" << type.getShape() << ")"; // Check source target pairs for duplicate sources or targets - absl::flat_hash_set sources; - absl::flat_hash_set targets; + llvm::DenseSet sources; + llvm::DenseSet targets; for (auto i = op.source_target_pairs().begin(), e = op.source_target_pairs().end(); i != e; ++i) { @@ -505,6 +502,46 @@ static LogicalResult Verify(TupleOp op) { return success(); } +namespace { + +// Pattern for unpacking and repacking the same tuple. 
+struct UnpackRepackSameTuple : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(TupleOp op, + PatternRewriter& rewriter) const override { + if (op.val().empty()) return failure(); + + Value first_element = op.val().front(); + auto first_element_op = + dyn_cast_or_null(first_element.getDefiningOp()); + if (!first_element_op || first_element_op.indexAttr().getInt() != 0) + return failure(); + + Value tuple_predecessor = first_element_op.getOperand(); + if (tuple_predecessor.getType() != op.getType()) return failure(); + + for (auto element_and_idx : llvm::enumerate(op.val().drop_front(1))) { + auto element_op = dyn_cast_or_null( + element_and_idx.value().getDefiningOp()); + if (!element_op || + element_op.indexAttr().getInt() != element_and_idx.index() + 1 || + element_op.getOperand() != tuple_predecessor) + return failure(); + } + + rewriter.replaceOp(op, tuple_predecessor); + return success(); + } +}; + +} // namespace + +void TupleOp::getCanonicalizationPatterns(OwningRewritePatternList& results, + MLIRContext* context) { + results.insert(context); +} + //===----------------------------------------------------------------------===// // AllToAllOp //===----------------------------------------------------------------------===// @@ -707,10 +744,12 @@ static LogicalResult Verify(DynamicBroadcastInDimOp op) { auto dimSize = operandType.getDimSize(i); auto resultDimSize = resultType.getDimSize(dimIndex); - if (dimSize != 1 && dimSize != resultDimSize) { + // Note: verifyCompatibleShapes doesn't consider size-1 broadcasting, so we + // add a manual check for this. + if (dimSize != 1 && failed(verifyCompatibleShape(dimSize, resultDimSize))) { return op.emitOpError( - llvm::formatv("size of operand dimension {0} ({1}) is not equal to " - "1 or size of result dimension {2} ({3})", + llvm::formatv("size of operand dimension {0} ({1}) is not compatible " + "with size of result dimension {2} ({3})", i, dimSize, dimIndex, resultDimSize)); } } @@ -744,7 +783,9 @@ class DynamicBroadcastInDimOpNotActuallyDynamic void DynamicBroadcastInDimOp::getCanonicalizationPatterns( OwningRewritePatternList& results, MLIRContext* context) { - results.insert(context); + results.insert( + context); } //===----------------------------------------------------------------------===// @@ -1465,7 +1506,7 @@ static LogicalResult Verify(PadOp op) { static LogicalResult Verify(ReshapeOp op) { // If the operand type is dynamically shaped there is nothing to verify. 
- auto operand_ty = op.operand().getType().cast(); + auto operand_ty = op.operand().getType().dyn_cast(); if (!operand_ty || !operand_ty.hasStaticShape()) return success(); // If the operand type is statically shaped (not required) the number of @@ -2119,7 +2160,7 @@ void CompareOp::build(OpBuilder& builder, OperationState& result, Value lhs, } #define GET_OP_CLASSES -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.cc.inc" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.cc.inc" //===----------------------------------------------------------------------===// // mhlo Dialect Interfaces @@ -2147,10 +2188,10 @@ struct HLOInlinerInterface : public DialectInlinerInterface { //===----------------------------------------------------------------------===// MhloDialect::MhloDialect(MLIRContext* context) - : Dialect(getDialectNamespace(), context) { + : Dialect(getDialectNamespace(), context, TypeID::get()) { addOperations< #define GET_OP_LIST -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.cc.inc" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.cc.inc" >(); addInterfaces(); addTypes(); diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/hlo_patterns.td b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/hlo_patterns.td new file mode 100644 index 00000000000..b8b6cb80fba --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/hlo_patterns.td @@ -0,0 +1,32 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Canonicalization patterns for the MHLO dialect. + +include "mlir/Dialect/Shape/IR/ShapeOps.td" +include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.td" + +def EqualBinaryOperands : Constraint>; + +// Canonicalization patterns. + +def DynamicBroadcastToOwnShape_1 : Pat< + (HLO_DynamicBroadcastInDimOp:$op $arg0, + (Shape_ToExtentTensorOp (Shape_ShapeOfOp $arg1)), $attr), + (replaceWithValue $arg0), [(EqualBinaryOperands $arg0, $arg1)]>; +def DynamicBroadcastToOwnShape_2 : Pat< + (HLO_DynamicBroadcastInDimOp:$op $arg0, (Shape_ShapeOfOp $arg1), $attr), + (replaceWithValue $arg0), [(EqualBinaryOperands $arg0, $arg1)]>; + diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/infer_fusibility_op_interface.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/infer_fusibility_op_interface.cc index eaa3414b36a..e93a6cfce3d 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/infer_fusibility_op_interface.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/infer_fusibility_op_interface.cc @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/infer_fusibility_op_interface.h" +#include "mlir-hlo/Dialect/mhlo/IR/infer_fusibility_op_interface.h" namespace mlir { -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/infer_fusibility_op_interface.cc.inc" +#include "mlir-hlo/Dialect/mhlo/IR/infer_fusibility_op_interface.cpp.inc" } // namespace mlir diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/init.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/init.cc new file mode 100644 index 00000000000..9fffeae1cc5 --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/init.cc @@ -0,0 +1,33 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mlir-hlo/Dialect/mhlo/IR/chlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/IR/register.h" + +// Static initialization for *HLO dialects registration. + +void mlir::mhlo::registerAllDialects() { + static bool init_once = []() { + registerDialect(); + registerDialect(); + registerDialect(); + return true; + }(); + (void)init_once; + + // Dependent dialects +} diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/lhlo_ops.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/lhlo_ops.cc index bd0dc224ccc..f61a66397e7 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/lhlo_ops.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/lhlo_ops.cc @@ -15,7 +15,7 @@ limitations under the License. // This file defines the operations used in the LMHLO dialect. -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" #include #include @@ -28,31 +28,31 @@ limitations under the License. 
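A hedged usage sketch for the registerAllDialects() entry point added in init.cc above; the main() scaffolding and the assumption that register.h declares the function are illustrative, not part of this change:

#include "mlir-hlo/Dialect/mhlo/IR/register.h"  // assumed to declare mlir::mhlo::registerAllDialects()
#include "mlir/IR/MLIRContext.h"

int main() {
  // With the 2020-era global dialect registry, registration must happen
  // before any *HLO IR is parsed or created.
  mlir::mhlo::registerAllDialects();
  mlir::MLIRContext context;
  // ... build or parse mhlo/chlo/lmhlo modules using `context` ...
  return 0;
}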
#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/FormatVariadic.h" -#include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/Builders.h" // from @llvm-project -#include "mlir/IR/Dialect.h" // from @llvm-project -#include "mlir/IR/Location.h" // from @llvm-project -#include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/IR/OpDefinition.h" // from @llvm-project -#include "mlir/IR/OpImplementation.h" // from @llvm-project -#include "mlir/IR/Operation.h" // from @llvm-project -#include "mlir/IR/OperationSupport.h" // from @llvm-project -#include "mlir/IR/PatternMatch.h" // from @llvm-project -#include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/IR/TypeUtilities.h" // from @llvm-project -#include "mlir/IR/Types.h" // from @llvm-project -#include "mlir/IR/Value.h" // from @llvm-project -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h.inc" +#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h.inc" +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/Dialect.h" +#include "mlir/IR/Location.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/IR/OpImplementation.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/OperationSupport.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/IR/TypeUtilities.h" +#include "mlir/IR/Types.h" +#include "mlir/IR/Value.h" namespace mlir { -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_structs.cc.inc" +#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops_structs.cc.inc" namespace lmhlo { LmhloDialect::LmhloDialect(MLIRContext *context) - : Dialect(getDialectNamespace(), context) { + : Dialect(getDialectNamespace(), context, TypeID::get()) { addOperations< #define GET_OP_LIST -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.cc.inc" +#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.cc.inc" >(); } @@ -127,7 +127,7 @@ static LogicalResult Verify(ReshapeMemRefCastOp op) { } #define GET_OP_CLASSES -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.cc.inc" +#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.cc.inc" // TODO(cheshire): Support folding, reuse code from hlo_ops.cc. diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/mhlo_canonicalize.td b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/mhlo_canonicalize.td new file mode 100644 index 00000000000..eb92d9e0e46 --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/mhlo_canonicalize.td @@ -0,0 +1,30 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This is the canonicalize pattern definition file. 
+ +include "mlir/IR/OpBase.td" +include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.td" +include "mlir-hlo/Dialect/mhlo/IR/hlo_utils.td" + +def UnaryToBinaryEinsumEq : NativeCodeCall< + "$_builder.getStringAttr(\",\" + $0.getValue().str())">; + +// Convert UnaryEinsumOp to EinsumOp with two operands with redundant first +// operand. +def UnaryEinsumToEinsum : Pat< + (HLO_UnaryEinsumOp $operand, $equation), + (HLO_EinsumOp (HLO_ConstOp (GetScalarOfType<1> $operand)), + $operand, (UnaryToBinaryEinsumEq $equation))>; diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/CMakeLists.txt b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/CMakeLists.txt new file mode 100644 index 00000000000..bb9f98d32d3 --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/CMakeLists.txt @@ -0,0 +1,155 @@ +# +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +include_directories(BEFORE + ${CMAKE_CURRENT_BINARY_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}) + +set(LLVM_TARGET_DEFINITIONS lower_complex_patterns.td) +mlir_tablegen(generated_lower_complex.inc -gen-rewriters) +add_public_tablegen_target(MLIRMhloLowerComplexIncGen) + +set(LLVM_TARGET_DEFINITIONS legalize_to_standard_patterns.td) +mlir_tablegen(generated_legalize_to_standard.inc -gen-rewriters) +add_public_tablegen_target(MLIRMhloLegalizeToStandardIncGen) + + +add_mlir_library(ChloPasses + chlo_legalize_to_hlo.cc + chlo_legalize_to_hlo_pass.cc + + DEPENDS + MLIRhlo_opsIncGen + + LINK_COMPONENTS + Core + + LINK_LIBS PUBLIC + ChloDialect + MLIRIR + MLIRPass +) + +add_mlir_library(MhloPasses + legalize_gather_to_torch_index_select.cc + legalize_tanh_to_approximation.cc + lower_complex.cc + lower_complex_patterns.td + lower_general_dot.cc + materialize_broadcasts.cc + materialize_broadcasts_pass.cc + mhlo_fusion.cc + optimize_mhlo.cc + optimize_mhlo_pass.cc + sink_constants_to_control_flow.cc + test_infer_shaped_type_pass.cc + transform_unranked_hlo.cc + unfuse_batch_norm.cc + unfuse_batch_norm_pass.cc + + DEPENDS + MLIRhlo_opsIncGen + MLIRMhloLowerComplexIncGen + + LINK_COMPONENTS + Core + + LINK_LIBS PUBLIC + MLIRIR + MLIRMhloUtils + MLIRPass + MLIRTransformUtils +) + +add_mlir_library(MhloToLhloConversion + hlo_legalize_to_lhlo.cc + + DEPENDS + MLIRhlo_opsIncGen + MLIRlhlo_opsIncGen + + LINK_COMPONENTS + Core + + LINK_LIBS PUBLIC + MhloDialect + LmhloDialect + MLIRIR + MLIRPass +) + +add_mlir_library(MhloToStandard + legalize_control_flow.cc + legalize_to_standard.cc + + DEPENDS + MLIRhlo_opsIncGen + MLIRlhlo_opsIncGen + MLIRMhloLegalizeToStandardIncGen + + LINK_COMPONENTS + Core + + LINK_LIBS PUBLIC + MLIRIR + MLIRPass +) + +add_mlir_library(MhloLhloToLinalg + legalize_to_linalg.cc + + DEPENDS + MLIRhlo_opsIncGen + MLIRlhlo_opsIncGen + + LINK_COMPONENTS + Core + + LINK_LIBS PUBLIC + MhloDialect + MLIRIR + MLIRPass +) + +add_mlir_library(LmhloPasses + lhlo_copy_removal.cc + lhlo_fuse_linalg.cc + lhlo_legalize_to_affine.cc + lhlo_legalize_to_gpu.cc + 
lhlo_legalize_to_llvm.cc + lhlo_legalize_to_llvm_pass.cc + lhlo_legalize_to_parallel_loops.cc + + DEPENDS + MLIRlhlo_opsIncGen + + LINK_COMPONENTS + Core + + LINK_LIBS PUBLIC + LmhloDialect + MLIRIR + MLIRPass +) + +add_library(AllMhloPasses INTERFACE) +target_link_libraries(AllMhloPasses INTERFACE + ChloPasses + MhloPasses + MhloToLhloConversion + MhloToStandard + MhloLhloToLinalg + LmhloPasses +) diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/chlo_legalize_to_hlo.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/chlo_legalize_to_hlo.cc index 06e95e04c76..c2db4880632 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/chlo_legalize_to_hlo.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/chlo_legalize_to_hlo.cc @@ -13,20 +13,22 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project -#include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/IR/OperationSupport.h" // from @llvm-project -#include "mlir/IR/PatternMatch.h" // from @llvm-project -#include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/Transforms/DialectConversion.h" // from @llvm-project -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/broadcast_utils.h" +#include "mlir-hlo/Dialect/mhlo/IR/chlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/transforms/rewriters.h" +#include "mlir-hlo/utils/broadcast_utils.h" +#include "mlir/Dialect/SCF/SCF.h" +#include "mlir/Dialect/Shape/IR/Shape.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/IR/Attributes.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/OperationSupport.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/Transforms/DialectConversion.h" namespace mlir { namespace chlo { - namespace { // Converts binary ops that statically are determined to not broadcast directly @@ -74,10 +76,6 @@ struct ConvertTrivialNonBroadcastBinaryOp : public OpRewritePattern { // - Legal combinations of degenerate (1-dim) implicit broadcasting. // The restriction on broadcast_dims derives from the definition of the // `shape.broadcast` op, which only supports prefix-padding. -// -// It may be possible to expand this pattern to operate on unranked tensors in -// the future by emitting more code to dynamically differentiate based on rank. -// Whether that is of any practical benefit remains to be seen. template struct ConvertRankedDynamicBroadcastBinaryOp : public OpRewritePattern { @@ -126,8 +124,8 @@ struct ConvertRankedDynamicBroadcastBinaryOp int64_t result_rank = std::max(lhs_type.getRank(), rhs_type.getRank()); Value result_extents = - hlo::ComputeBinaryElementwiseBroadcastingResultExtents(loc, lhs, rhs, - rewriter); + hlo::ComputeBinaryElementwiseBroadcastingResultExtents( + loc, lhs, rhs, rewriter, /*unsafe_as_extent_tensor=*/true); // Note that we unconditionally emit DynamicBroadcastInDim ops and let // downstream canonicalizations fold them away if possible. 
This is @@ -160,6 +158,273 @@ struct ConvertRankedDynamicBroadcastBinaryOp } }; +// Converts a broadcasting binary operation with a scalar operand and an +// unranked operand to a ranked broadcasting operation by dynamically reshaping +// the unranked operand to a 1D tensor. This will always be safe because +// broadcasting from a scalar to another shape always works. +template +struct ConvertUnrankedScalarDynamicBroadcastBinaryOp + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(ChloOpTy op, + PatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + Value lhs = op.lhs(); + Value rhs = op.rhs(); + + auto lhs_ranked_type = lhs.getType().dyn_cast(); + auto lhs_unranked_type = lhs.getType().dyn_cast(); + + auto rhs_ranked_type = rhs.getType().dyn_cast(); + auto rhs_unranked_type = rhs.getType().dyn_cast(); + + bool lhs_is_scalar = lhs_ranked_type && + lhs_ranked_type.getShape().empty() && + rhs_unranked_type; + bool rhs_is_scalar = rhs_ranked_type && + rhs_ranked_type.getShape().empty() && + lhs_unranked_type; + + // Only support the case where exactly one operand is scalar and the other + // is unranked. Other patterns in this file will create more efficient + // lowerings for cases where both ranks are known or will handle the more + // generic case of both inputs being unranked. + if (!(lhs_is_scalar ^ rhs_is_scalar)) return failure(); + + auto result_type = op.getResult().getType().template dyn_cast(); + + // Reshape the non-scalar value into a dynamically sized, rank-1 tensor + Value shape = + rewriter.create(loc, lhs_is_scalar ? rhs : lhs); + Value num_elements = rewriter.create(loc, shape); + Value size_tensor = + rewriter.create(loc, num_elements); + Value reshaped = rewriter.create( + loc, RankedTensorType::get({-1}, result_type.getElementType()), + lhs_is_scalar ? rhs : lhs, size_tensor); + + // Create a new ranked Chlo op that will be further lowered by other + // patterns into Mhlo. + SmallVector operands{lhs_is_scalar ? lhs : reshaped, + rhs_is_scalar ? rhs : reshaped}; + Value computed = rewriter.create( + loc, SmallVector{reshaped.getType()}, operands, op.getAttrs()); + + // Reshape the result back into an unranked tensor. + rewriter.replaceOpWithNewOp(op, result_type, + computed, shape); + + return success(); + } +}; + +// Handles lowering of the following pattern to patterns that will be further +// matched by other patterns until they result in LHLO: +// %result = "chlo.op"(%lhs, %rhs) : (<*xTy>, <*xTy>) -> <*xTy> +// +// The sequence of specializations this handles is: +// - Either operand being scalar +// - Operands having equal shapes +// - The resulting value being any of ranks [2,6] +template +struct ConvertUnrankedDynamicBroadcastBinaryOp + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(ChloOpTy op, + PatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + Value lhs = op.lhs(); + Value rhs = op.rhs(); + auto lhs_type = lhs.getType().dyn_cast(); + auto rhs_type = rhs.getType().dyn_cast(); + auto result_type = op.getResult().getType().template dyn_cast(); + + // Only support unranked operands. If either operand is ranked, another + // pattern will handle the lowering. 
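As a reading aid for the scalar/unranked pattern above: the unranked side is flattened to a dynamically sized rank-1 tensor before the ranked broadcast is applied, then reshaped back afterwards. A condensed sketch of the flattening step, with the op class names (shape::ShapeOfOp, shape::NumElementsOp, TensorFromElementsOp, mhlo::DynamicReshapeOp) filled in as assumptions inferred from the surrounding code:

#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
#include "mlir/Dialect/Shape/IR/Shape.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/StandardTypes.h"

// Flattens an unranked tensor value to a rank-1 tensor with a dynamic size,
// mirroring the reshape step of the scalar/unranked pattern above.
static mlir::Value FlattenToRank1(mlir::PatternRewriter &rewriter,
                                  mlir::Location loc, mlir::Value unranked,
                                  mlir::Type element_type) {
  mlir::Value shape = rewriter.create<mlir::shape::ShapeOfOp>(loc, unranked);
  mlir::Value num_elements =
      rewriter.create<mlir::shape::NumElementsOp>(loc, shape);
  mlir::Value size_tensor =
      rewriter.create<mlir::TensorFromElementsOp>(loc, num_elements);
  return rewriter.create<mlir::mhlo::DynamicReshapeOp>(
      loc, mlir::RankedTensorType::get({-1}, element_type), unranked,
      size_tensor);
}

The check that follows restricts this second, fully general pattern to the case where both operands are unranked.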
+ if (!lhs_type || !rhs_type) return failure();
+
+ // If lhs is scalar
+ auto if_op = rewriter.create(
+ loc, result_type, IsScalarTensor(rewriter, op, lhs), true);
+ OpBuilder if_lhs_scalar_builder = if_op.getThenBodyBuilder();
+ Value reshaped_lhs = if_lhs_scalar_builder.create(
+ loc, RankedTensorType::get({}, lhs_type.getElementType()), lhs);
+ Value if_lhs_scalar_result = if_lhs_scalar_builder.create(
+ loc, ArrayRef{result_type}, ArrayRef{reshaped_lhs, rhs},
+ op.getAttrs());
+ if_lhs_scalar_builder.create(loc, if_lhs_scalar_result);
+
+ // If lhs is NOT scalar
+ //
+ // See if rhs is scalar
+ OpBuilder else_lhs_scalar_builder = if_op.getElseBodyBuilder();
+ auto if_rhs_scalar_op = else_lhs_scalar_builder.create(
+ loc, result_type, IsScalarTensor(else_lhs_scalar_builder, op, rhs),
+ true);
+ else_lhs_scalar_builder.create(loc,
+ if_rhs_scalar_op.getResult(0));
+ OpBuilder if_rhs_scalar_builder = if_rhs_scalar_op.getThenBodyBuilder();
+ Value reshaped_rhs = if_rhs_scalar_builder.create(
+ loc, RankedTensorType::get({}, lhs_type.getElementType()), rhs);
+ Value if_rhs_scalar_result = if_rhs_scalar_builder.create(
+ loc, ArrayRef{result_type}, ArrayRef{lhs, reshaped_rhs},
+ op.getAttrs());
+ if_rhs_scalar_builder.create(loc, if_rhs_scalar_result);
+
+ // If NEITHER shape is scalar
+ //
+ // See if shapes are equal.
+ OpBuilder else_no_scalars_builder = if_rhs_scalar_op.getElseBodyBuilder();
+ Value shape_of_lhs =
+ else_no_scalars_builder.create(loc, lhs);
+ Value shape_of_rhs =
+ else_no_scalars_builder.create(loc, rhs);
+ Value equal_shapes = else_no_scalars_builder.create(
+ loc, shape_of_lhs, shape_of_rhs);
+
+ auto if_eq_shapes_op = else_no_scalars_builder.create(
+ loc, result_type, equal_shapes, true);
+ else_no_scalars_builder.create(loc,
+ if_eq_shapes_op.getResult(0));
+
+ OpBuilder if_eq_shapes_builder = if_eq_shapes_op.getThenBodyBuilder();
+ Value non_broadcast_op =
+ Adaptor::CreateOp(op, result_type, lhs, rhs, if_eq_shapes_builder);
+ if_eq_shapes_builder.create(loc, non_broadcast_op);
+
+ // If shapes are not scalar, nor equal
+ //
+ // See if values are of a rank that we support.
+ OpBuilder if_neq_shapes_builder = if_eq_shapes_op.getElseBodyBuilder();
+ if_neq_shapes_builder.create(
+ loc, HandleBroadcastAndOp(if_neq_shapes_builder, op, lhs, rhs));
+
+ rewriter.replaceOp(op, {if_op.getResult(0)});
+ return success();
+ }
+
+ private:
+ // Returns the dynamic result of checking whether the given value is a scalar
+ // tensor.
+ Value IsScalarTensor(OpBuilder &rewriter, ChloOpTy op, Value tensor) const {
+ auto loc = op.getLoc();
+
+ Value shape_of_tensor = rewriter.create(loc, tensor);
+ Value rank_tensor = rewriter.create(
+ loc, rewriter.getIndexType(), shape_of_tensor);
+ return rewriter.create(loc, rewriter.getI1Type(), CmpIPredicate::eq,
+ rank_tensor,
+ rewriter.create(loc, 0));
+ }
+
+ // Create the if statement and code for a broadcasting op with a result of a
+ // given rank.
+ scf::IfOp createRankSpecializedBroadcastAndOp(OpBuilder &builder, ChloOpTy op,
+ Value lhs, Value rhs,
+ Value actual_rank,
+ int targeted_rank) const {
+ auto loc = op.getLoc();
+
+ // Create the if block to place the current specialized logic in.
+ Value greater_rank_is_n = builder.create(
+ loc, CmpIPredicate::eq, actual_rank,
+ builder.create(loc, targeted_rank));
+ auto if_op =
+ builder.create(loc, lhs.getType(), greater_rank_is_n, true);
+ OpBuilder if_builder = if_op.getThenBodyBuilder();
+
+ // Handle shape broadcasting and inference.
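The IsScalarTensor helper above asks, at runtime, whether a tensor has rank zero. A standalone sketch with the elided op classes written out as assumptions (shape::ShapeOfOp, shape::RankOp, ConstantIndexOp, CmpIOp):

#include "mlir/Dialect/Shape/IR/Shape.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/Builders.h"

// Emits IR that answers "is this tensor rank 0?" at runtime. The op class
// names are assumptions inferred from the variable names used above.
static mlir::Value EmitIsScalarCheck(mlir::OpBuilder &b, mlir::Location loc,
                                     mlir::Value tensor) {
  mlir::Value shape = b.create<mlir::shape::ShapeOfOp>(loc, tensor);
  mlir::Value rank =
      b.create<mlir::shape::RankOp>(loc, b.getIndexType(), shape);
  mlir::Value zero = b.create<mlir::ConstantIndexOp>(loc, 0);
  return b.create<mlir::CmpIOp>(loc, mlir::CmpIPredicate::eq, rank, zero);
}

The builder code that continues below then pads both operand shapes to the targeted rank and applies the ranked broadcast.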
+ Value lhs_shape = if_builder.create(loc, lhs); + Value rhs_shape = if_builder.create(loc, rhs); + SmallVector ranked_shape(targeted_rank, 1); + auto extent_tensor_type = + RankedTensorType::get({targeted_rank}, builder.getIndexType()); + auto reshaped_type = RankedTensorType::get( + llvm::SmallVector(targeted_rank, + RankedTensorType::kDynamicSize), + lhs.getType().template dyn_cast().getElementType()); + Value ranked_shape_val = if_builder.create( + loc, extent_tensor_type, + mlir::DenseIntElementsAttr::get(extent_tensor_type, ranked_shape)); + // TODO(tpopp): Return extent tensors when possible to signal that this is a + // guaranteed safe broadcast by construction. + Value extended_lhs = if_builder.create( + loc, extent_tensor_type, lhs_shape, ranked_shape_val, nullptr); + Value extended_rhs = if_builder.create( + loc, extent_tensor_type, rhs_shape, ranked_shape_val, nullptr); + + // 1. Reshape operands to the given rank (with the same number of elements) + // 2. Compute the ranked-broadcasted ChloOp (which will assert that the ops + // can be broadcasted and do the actual broadcasting) + // 3. Type erase the output back to unranked + Value reshaped_lhs = if_builder.create( + loc, reshaped_type, lhs, extended_lhs); + Value reshaped_rhs = if_builder.create( + loc, reshaped_type, rhs, extended_rhs); + Value result = if_builder.create( + loc, ArrayRef{reshaped_type}, + ArrayRef{reshaped_lhs, reshaped_rhs}, op.getAttrs()); + Value reshaped_result = if_builder.create( + loc, UnrankedTensorType::get(reshaped_type.getElementType()), result); + if_builder.create(loc, reshaped_result); + + // Return the if_op, so the result can be used and the else block can be + // used for the next rank specialized step. + return if_op; + } + + // Iterates over the desired ranks to be specialized and generates the code + // snippet for each case. + Value HandleBroadcastAndOp(OpBuilder &rewriter, ChloOpTy op, Value lhs, + Value rhs) const { + constexpr int max_rank_specialization = 7; + auto loc = op.getLoc(); + + // Find the larger rank of the 2 operands. + auto extent_tensor_type = RankedTensorType::get({ShapedType::kDynamicSize}, + rewriter.getIndexType()); + Value lhs_shape = + rewriter.create(loc, extent_tensor_type, lhs); + Value rhs_shape = + rewriter.create(loc, extent_tensor_type, rhs); + Value lhs_rank = + rewriter.create(loc, rewriter.getIndexType(), lhs_shape); + Value rhs_rank = + rewriter.create(loc, rewriter.getIndexType(), rhs_shape); + Value greater_rank_lhs = + rewriter.create(loc, CmpIPredicate::sgt, lhs_rank, rhs_rank); + Value greater_rank = + rewriter.create(loc, greater_rank_lhs, lhs_rank, rhs_rank); + + // Generate a list of nested if/else statements to handle rank + // specializations from 2-6. + scf::IfOp if_op = createRankSpecializedBroadcastAndOp(rewriter, op, lhs, + rhs, greater_rank, 2); + + // Put each subsequent rank specialization inside the else statement of the + // previous one. + OpBuilder else_builder = if_op.getElseBodyBuilder(); + for (int i = 3; i < max_rank_specialization; i++) { + auto inner_if = createRankSpecializedBroadcastAndOp(else_builder, op, lhs, + rhs, greater_rank, i); + + else_builder.create(loc, inner_if.getResult(0)); + else_builder = inner_if.getElseBodyBuilder(); + } + + // Fire an assertion if none of the rank specializations applied (one of the + // ranks was greater than 6). 
+ else_builder.create( + loc, else_builder.create(loc, 0, 1), + "Input for dynamic binary op lowering was of a rank greater than 6"); + else_builder.create(loc, lhs); + + // Return the result of the outermost if statement. + return if_op.getResult(0); + } +}; + template void PopulateForBinaryOp(MLIRContext *context, OwningRewritePatternList *patterns) { @@ -169,6 +434,10 @@ void PopulateForBinaryOp(MLIRContext *context, patterns->insert< ConvertRankedDynamicBroadcastBinaryOp>( context, 5); + patterns->insert< + ConvertUnrankedScalarDynamicBroadcastBinaryOp, + ConvertUnrankedDynamicBroadcastBinaryOp>( + context); } template diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/chlo_legalize_to_hlo_pass.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/chlo_legalize_to_hlo_pass.cc index 48749c7d43d..50cd6df5c99 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/chlo_legalize_to_hlo_pass.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/chlo_legalize_to_hlo_pass.cc @@ -13,15 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project -#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h" +#include "mlir-hlo/Dialect/mhlo/IR/chlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/transforms/passes.h" +#include "mlir-hlo/Dialect/mhlo/transforms/rewriters.h" +#include "mlir/Dialect/SCF/SCF.h" +#include "mlir/Dialect/Shape/IR/Shape.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/Pass/Pass.h" namespace mlir { -namespace chlo { +namespace mhlo { namespace { @@ -31,14 +33,15 @@ struct TestChloLegalizeToHloPass ConversionTarget conversionTarget(getContext()); OwningRewritePatternList conversionPatterns; - conversionTarget.addIllegalDialect(); + conversionTarget.addIllegalDialect(); // Consider the mhlo dialect legal for tests. conversionTarget.addLegalDialect(); // The conversion uses helpers from the Standard dialect. 
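A hedged sketch of the dialect-conversion driver that the surrounding test pass sets up: mark chlo illegal, mark the target dialects legal, populate the patterns, and apply a partial conversion. The dialect class names (chlo::HloClientDialect and friends) are assumptions; the structure mirrors TestChloLegalizeToHloPass:

#include "mlir-hlo/Dialect/mhlo/IR/chlo_ops.h"
#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
#include "mlir-hlo/Dialect/mhlo/transforms/rewriters.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/Shape/IR/Shape.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/Function.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/Transforms/DialectConversion.h"

// Runs the chlo->mhlo patterns over a single function.
static mlir::LogicalResult LowerChloToHlo(mlir::FuncOp func) {
  mlir::MLIRContext *context = func.getContext();
  mlir::ConversionTarget target(*context);
  target.addIllegalDialect<mlir::chlo::HloClientDialect>();
  target.addLegalDialect<mlir::mhlo::MhloDialect, mlir::StandardOpsDialect,
                         mlir::shape::ShapeDialect, mlir::scf::SCFDialect>();

  mlir::OwningRewritePatternList patterns;
  mlir::chlo::PopulateLegalizeChloToHloPatterns(context, &patterns);
  return mlir::applyPartialConversion(func, target, patterns);
}

The remaining addLegalDialect calls below additionally admit the SCF dialect, which the new unranked lowerings emit.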
conversionTarget.addLegalDialect(); conversionTarget.addLegalDialect(); + conversionTarget.addLegalDialect(); - PopulateLegalizeChloToHloPatterns(&getContext(), &conversionPatterns); + chlo::PopulateLegalizeChloToHloPatterns(&getContext(), &conversionPatterns); if (failed(applyPartialConversion(getFunction(), conversionTarget, conversionPatterns))) { @@ -49,9 +52,10 @@ struct TestChloLegalizeToHloPass } // namespace -} // namespace chlo +std::unique_ptr createTestChloLegalizeToHloPass() { + return std::make_unique(); +} + +} // namespace mhlo } // namespace mlir -static mlir::PassRegistration pass( - "mhlo-test-chlo-legalize-to-hlo", - "Test pass for applying chlo -> hlo legalization patterns"); diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/hlo_legalize_to_lhlo.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/hlo_legalize_to_lhlo.cc index 4ee45d56a8e..a8c3ad17ebb 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/hlo_legalize_to_lhlo.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/hlo_legalize_to_lhlo.cc @@ -15,26 +15,25 @@ limitations under the License. // This file implements logic for lowering HLO dialect to LHLO dialect. -#include "absl/memory/memory.h" -#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project -#include "mlir/IR/AffineMap.h" // from @llvm-project -#include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/BlockAndValueMapping.h" // from @llvm-project -#include "mlir/IR/Builders.h" // from @llvm-project -#include "mlir/IR/Function.h" // from @llvm-project -#include "mlir/IR/Location.h" // from @llvm-project -#include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/IR/Operation.h" // from @llvm-project -#include "mlir/IR/PatternMatch.h" // from @llvm-project -#include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Transforms/BufferPlacement.h" // from @llvm-project -#include "mlir/Transforms/DialectConversion.h" // from @llvm-project -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_hlo_to_lhlo_op.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/transforms/map_hlo_to_lhlo_op.h" +#include "mlir-hlo/Dialect/mhlo/transforms/passes.h" +#include "mlir-hlo/Dialect/mhlo/transforms/rewriters.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/IR/AffineMap.h" +#include "mlir/IR/Attributes.h" +#include "mlir/IR/BlockAndValueMapping.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/Function.h" +#include "mlir/IR/Location.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/BufferPlacement.h" +#include "mlir/Transforms/DialectConversion.h" namespace mlir { namespace mhlo { @@ -42,9 +41,6 @@ namespace { template using BaseOpConversion = BufferAssignmentOpConversionPattern; -using StdReturnOpConverter = - detail::BufferAssignmentReturnOpConverter; Value InsertDynamicAllocAndDealloc(Location 
loc, Value result, Value shape_operand, @@ -272,27 +268,21 @@ struct HloToLhloReduceOpConverter : public BaseOpConversion { // Copy over the operations inside the region. rewriter.inlineRegionBefore(op.body(), new_op.body(), new_op.body().end()); - // Create new block arguments with correct type. + // Convert the region signature to memref and add extra result. auto& entry_block = new_op.body().front(); - int original_arg_count = entry_block.getNumArguments(); - for (int i = 0; i < original_arg_count; ++i) { - auto old_arg = entry_block.getArgument(i); - auto old_type = old_arg.getType().cast(); + TypeConverter::SignatureConversion sig_conversion( + entry_block.getNumArguments() + 1); + for (auto arg : entry_block.getArguments()) { + auto old_type = arg.getType().cast(); auto new_type = MemRefType::get(old_type.getShape(), old_type.getElementType()); - auto new_arg = entry_block.addArgument(new_type); - rewriter.replaceUsesOfBlockArgument(old_arg, new_arg); + sig_conversion.addInputs(arg.getArgNumber(), new_type); } - // Add an argument for the result. - entry_block.addArgument( - entry_block.getArgument(original_arg_count).getType()); - // Remove the old arguments. - for (int i = original_arg_count - 1; i >= 0; --i) { - entry_block.eraseArgument(i); - } - // Insert terminator at the end. - rewriter.setInsertionPointToEnd(&entry_block); - rewriter.create(loc); + auto return_op = cast(entry_block.getTerminator()); + auto result_type = return_op.results().front().getType().cast(); + sig_conversion.addInputs({MemRefType::get(result_type.getShape(), + result_type.getElementType())}); + rewriter.applySignatureConversion(&new_op.body(), sig_conversion); rewriter.replaceOp(op, ArrayRef(buffer_args).slice(operands.size())); @@ -300,6 +290,12 @@ struct HloToLhloReduceOpConverter : public BaseOpConversion { } }; +// Legalize mhlo.return to a lmhlo.copy and lmhlo.terminator. This functionality +// is provided by mlir buffer assignment, so use the pattern from there. +// TODO(DFKI): Move this out of detail. +using HloToLhloReturnOpConverter = detail::BufferAssignmentReturnOpConverter< + mhlo::ReturnOp, lmhlo::TerminatorOp, lmhlo::CopyOp, false>; + class HloToLhloTensorLoadOpConverter : public BaseOpConversion { public: @@ -312,7 +308,6 @@ class HloToLhloTensorLoadOpConverter } }; -// TODO(b/137624192): Rewrite into a copy and elide copy if possible. 
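The reduce-region change above replaces manual block-argument surgery with a TypeConverter::SignatureConversion. A minimal sketch of that idiom, converting every tensor block argument of a region to a memref (the extra result buffer the real converter appends is omitted for brevity):

#include "mlir/IR/Block.h"
#include "mlir/IR/Region.h"
#include "mlir/IR/StandardTypes.h"
#include "mlir/Transforms/DialectConversion.h"

// Builds a SignatureConversion mapping each (ranked) tensor argument of the
// region's entry block to a memref of the same shape, then lets the rewriter
// rewrite the region signature in one shot.
static void ConvertRegionArgsToMemrefs(
    mlir::Region &region, mlir::ConversionPatternRewriter &rewriter) {
  mlir::Block &entry_block = region.front();
  mlir::TypeConverter::SignatureConversion sig_conversion(
      entry_block.getNumArguments());
  for (mlir::BlockArgument arg : entry_block.getArguments()) {
    auto tensor_type = arg.getType().cast<mlir::TensorType>();
    sig_conversion.addInputs(
        arg.getArgNumber(),
        mlir::MemRefType::get(tensor_type.getShape(),
                              tensor_type.getElementType()));
  }
  rewriter.applySignatureConversion(&region, sig_conversion);
}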
class HloToLhloTensorStoreOpConverter : public BaseOpConversion { public: @@ -506,6 +501,7 @@ void populateHLOToLHLOConversionPattern( HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloReduceOpConverter, + HloToLhloReturnOpConverter, HloToLhloTensorLoadOpConverter, HloToLhloTensorStoreOpConverter >(context, bufferAssignment, converter); @@ -514,11 +510,8 @@ void populateHLOToLHLOConversionPattern( std::unique_ptr> createLegalizeToLhloPass( bool results_escape_function) { - return absl::make_unique(results_escape_function); + return std::make_unique(results_escape_function); } -static PassRegistration legalize_pass( - "hlo-legalize-to-lhlo", "Legalize from HLO dialect to LHLO dialect"); - } // namespace mhlo } // namespace mlir diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_control_flow.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_control_flow.cc index 440df7ec23f..b6e23a6b131 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_control_flow.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_control_flow.cc @@ -18,27 +18,27 @@ limitations under the License. #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/Casting.h" -#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project -#include "mlir/IR/Block.h" // from @llvm-project -#include "mlir/IR/BlockAndValueMapping.h" // from @llvm-project -#include "mlir/IR/Builders.h" // from @llvm-project -#include "mlir/IR/Function.h" // from @llvm-project -#include "mlir/IR/PatternMatch.h" // from @llvm-project -#include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/IR/TypeUtilities.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Pass/PassRegistry.h" // from @llvm-project -#include "mlir/Support/LogicalResult.h" // from @llvm-project -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/transforms/passes.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/IR/Block.h" +#include "mlir/IR/BlockAndValueMapping.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/Function.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/IR/TypeUtilities.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassRegistry.h" +#include "mlir/Support/LogicalResult.h" using mlir::PassRegistration; namespace mlir { namespace mhlo { namespace { -struct LegalizeControlFlow - : public mlir::PassWrapper { +struct LegalizeControlFlowPass + : public mlir::PassWrapper { // Perform the lowering to MLIR control flow. 
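With the static PassRegistration objects removed in this change, these passes are meant to be instantiated through their create*Pass() factories. A sketch of wiring two of them into a PassManager; the particular pipeline composition is an illustrative assumption, not one defined by this change:

#include "mlir-hlo/Dialect/mhlo/transforms/passes.h"
#include "mlir/IR/Function.h"
#include "mlir/Pass/PassManager.h"

// Adds two of the function-level lowerings via their factory functions.
static void BuildExamplePipeline(mlir::PassManager &pm) {
  pm.addNestedPass<mlir::FuncOp>(mlir::mhlo::createLegalizeControlFlowPass());
  pm.addNestedPass<mlir::FuncOp>(mlir::mhlo::createLegalizeToStdPass());
}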
void runOnFunction() override; }; @@ -206,7 +206,7 @@ LogicalResult LowerWhileOp(mlir::mhlo::WhileOp while_op) { return success(); } -void LegalizeControlFlow::runOnFunction() { +void LegalizeControlFlowPass::runOnFunction() { auto func = getFunction(); llvm::SmallVector if_ops; func.walk([&](IfOp op) { if_ops.push_back(op); }); @@ -228,9 +228,5 @@ void LegalizeControlFlow::runOnFunction() { std::unique_ptr> mlir::mhlo::createLegalizeControlFlowPass() { - return std::make_unique(); + return std::make_unique(); } - -static PassRegistration legalize_cf_pass( - "mhlo-legalize-control-flow", - "Legalize from MHLO control flow to CFG control flow"); diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_gather_to_torch_index_select.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_gather_to_torch_index_select.cc new file mode 100644 index 00000000000..59cd3381133 --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_gather_to_torch_index_select.cc @@ -0,0 +1,151 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/transforms/passes.h" +#include "mlir-hlo/Dialect/mhlo/transforms/rewriters.h" +#include "mlir/IR/Function.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Pass/Pass.h" + +namespace mlir { + +namespace mhlo { +namespace { + +struct GatherIsTorchIndexSelect : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(GatherOp gather, + PatternRewriter &rewriter) const override { + auto start_indices = gather.start_indices(); + auto start_indices_ty = start_indices.getType().cast(); + if (!start_indices_ty.hasRank()) { + return failure(); + } + + auto operand = gather.operand(); + auto operand_ty = operand.getType().cast(); + if (!operand_ty.hasRank()) { + return failure(); + } + + int64_t index_vector_dim = + std::max(0, start_indices_ty.getRank() - 1); + + // We can use torch_index_select if the last dimension represents the + // gather indices. + auto dimension_numbers = gather.dimension_numbers(); + if (dimension_numbers.index_vector_dim().getValue().getSExtValue() != + index_vector_dim) { + return failure(); + } + + // Index select only works across a single dimension. + if (!start_indices_ty.getShape().empty() && + start_indices_ty.getShape().back() != 1) { + return failure(); + } + + // Only support the default case for start_index_map. + if (dimension_numbers.start_index_map().getType().getRank() != 1 || + dimension_numbers.start_index_map() + .getValue(0) + .cast() + .getValue() != 0) { + return failure(); + } + + auto result_ty = gather.getResult().getType().dyn_cast(); + if (!result_ty) { + return failure(); + } + + // Offset dimensions should be the defaults. 
+ if (dimension_numbers.offset_dims().getType().getNumElements() != + result_ty.getRank() - index_vector_dim) { + return failure(); + } + + for (auto it : llvm::enumerate(dimension_numbers.offset_dims())) { + if ((it.index() + index_vector_dim) != it.value()) { + return failure(); + } + } + + for (auto it : llvm::enumerate(gather.slice_sizes().getIntValues())) { + // First shape value must be 1. + if (it.index() == 0) { + if (it.value().getSExtValue() != 1) { + return failure(); + } + continue; + } + + // The gather needs to index the entire slice for each other dimension. + if (it.value().getSExtValue() != operand_ty.getDimSize(it.index())) { + return failure(); + } + } + + llvm::SmallVector index_select_shape = + llvm::to_vector<4>(start_indices_ty.getShape()); + + for (auto dim : operand_ty.getShape().drop_front()) { + index_select_shape.push_back(dim); + } + + if (!dimension_numbers.collapsed_slice_dims().getType().hasRank() || + dimension_numbers.collapsed_slice_dims().getType().getNumElements() != + 1 || + dimension_numbers.collapsed_slice_dims().getValue({0}) != 0) { + return failure(); + } + + auto torch_index_select = rewriter.create( + gather.getLoc(), + RankedTensorType::get(index_select_shape, operand_ty.getElementType()), + operand, gather.start_indices(), rewriter.getI64IntegerAttr(0), + rewriter.getI64IntegerAttr(0)); + + rewriter.replaceOpWithNewOp(gather, gather.getType(), + torch_index_select); + + return success(); + } +}; + +struct LegalizeGatherToTorchIndexSelectPass + : public PassWrapper { + /// Perform the lowering of standard dialect operations to approximations. + void runOnFunction() override { + OwningRewritePatternList patterns; + PopulateGatherToTorchIndexSelectPatterns(&getContext(), &patterns); + applyPatternsAndFoldGreedily(getFunction(), patterns); + } +}; +} // namespace + +void PopulateGatherToTorchIndexSelectPatterns( + mlir::MLIRContext *context, OwningRewritePatternList *patterns) { + patterns->insert(context); +} + +std::unique_ptr createLegalizeGatherToTorchIndexSelectPass() { + return std::make_unique(); +} + +} // namespace mhlo +} // namespace mlir diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_tanh_to_approximation.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_tanh_to_approximation.cc index 1890646160e..57c494f536b 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_tanh_to_approximation.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_tanh_to_approximation.cc @@ -16,15 +16,15 @@ limitations under the License. // This file implements logic for lowering the tanh standard ops to an // approximation. -#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project -#include "mlir/IR/Function.h" // from @llvm-project -#include "mlir/IR/PatternMatch.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h" +#include "mlir-hlo/Dialect/mhlo/transforms/passes.h" +#include "mlir-hlo/Dialect/mhlo/transforms/rewriters.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/IR/Function.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Pass/Pass.h" namespace mlir { -namespace hlo { +namespace mhlo { namespace { /// Emits the fast tanh approximation that is also used by XLA. 
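The new gather pass above follows the usual populate-then-apply shape. A compact sketch of that driver, reusing the PopulateGatherToTorchIndexSelectPatterns hook it defines (assumed to be declared in rewriters.h):

#include "mlir-hlo/Dialect/mhlo/transforms/rewriters.h"
#include "mlir/IR/Function.h"
#include "mlir/IR/PatternMatch.h"

// Mirrors LegalizeGatherToTorchIndexSelectPass::runOnFunction(): collect the
// rewrite patterns and drive them greedily over one function.
static void RewriteGathersIn(mlir::FuncOp func) {
  mlir::OwningRewritePatternList patterns;
  mlir::mhlo::PopulateGatherToTorchIndexSelectPatterns(func.getContext(),
                                                       &patterns);
  mlir::applyPatternsAndFoldGreedily(func, patterns);
}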
@@ -126,8 +126,8 @@ class ApproximateTanhLowering : public OpRewritePattern { } }; -struct LegalizeTanhToApproximation - : public PassWrapper { +struct LegalizeTanhToApproximationPass + : public PassWrapper { /// Perform the lowering of standard dialect operations to approximations. void runOnFunction() override { OwningRewritePatternList patterns; @@ -140,7 +140,7 @@ struct LegalizeTanhToApproximation std::unique_ptr> createLegalizeTanhToApproximationPass() { - return std::make_unique(); + return std::make_unique(); } void PopulateTanhToApproximationPatterns(mlir::MLIRContext *context, @@ -148,9 +148,5 @@ void PopulateTanhToApproximationPatterns(mlir::MLIRContext *context, patterns->insert(context); } -static PassRegistration legalize_pass( - "mhlo-legalize-tanh-to-approximation", - "Legalize tanh from standard dialect to an approximation"); - -} // namespace hlo +} // namespace mhlo } // namespace mlir diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_linalg.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_linalg.cc index 717e9682436..f47f2c2fbdc 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_linalg.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_linalg.cc @@ -15,26 +15,25 @@ limitations under the License. // This file implements logic for lowering HLO/LHLO dialect to Linalg dialect. -#include "absl/memory/memory.h" -#include "mlir/Dialect/Affine/IR/AffineOps.h" // from @llvm-project -#include "mlir/Dialect/Linalg/IR/LinalgOps.h" // from @llvm-project -#include "mlir/Dialect/Linalg/IR/LinalgTypes.h" // from @llvm-project -#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project -#include "mlir/IR/AffineExpr.h" // from @llvm-project -#include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/Builders.h" // from @llvm-project -#include "mlir/IR/Function.h" // from @llvm-project -#include "mlir/IR/Location.h" // from @llvm-project -#include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/IR/Operation.h" // from @llvm-project -#include "mlir/IR/PatternMatch.h" // from @llvm-project -#include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Transforms/DialectConversion.h" // from @llvm-project -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_lmhlo_to_scalar_op.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/transforms/map_lmhlo_to_scalar_op.h" +#include "mlir-hlo/Dialect/mhlo/transforms/rewriters.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Linalg/IR/LinalgOps.h" +#include "mlir/Dialect/Linalg/IR/LinalgTypes.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/IR/AffineExpr.h" +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/Function.h" +#include "mlir/IR/Location.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/DialectConversion.h" namespace mlir { namespace { @@ -298,8 +297,8 
@@ class DataMovementOpConverter : public OpConversionPattern { auto nloops = resultType.getRank(); auto loc = op.getLoc(); auto linalgOp = rewriter.create( - loc, isLHLO ? ArrayRef{} : resultType, args, /*inputCount=*/1, - /*outputCount=*/1, indexing_maps, GetNParallelLoopsAttrs(nloops), + loc, isLHLO ? ArrayRef{} : resultType, args, /*argsIn=*/1, + /*argsOut=*/1, indexing_maps, GetNParallelLoopsAttrs(nloops), [&](OpBuilder& nestedBuilder, Location nestedLoc, ValueRange args) { nestedBuilder.create(loc, *args.begin()); }); @@ -420,7 +419,7 @@ class LhloBroadcastInDimConverter rewriter.create(loc, operand, llvm::makeArrayRef({zero})); rewriter.create( loc, llvm::None, llvm::makeArrayRef(operand_adaptor.output()), - /*inputCount=*/0, /*outputCount=*/1, + /*argsIn=*/0, /*argsOut=*/1, llvm::makeArrayRef(rewriter.getMultiDimIdentityMap(nloops)), GetNParallelLoopsAttrs(nloops), [&](OpBuilder& nestedBuilder, Location nestedLoc, ValueRange args) { @@ -433,7 +432,7 @@ class LhloBroadcastInDimConverter rewriter.create( loc, llvm::None, llvm::makeArrayRef({operand, operand_adaptor.output()}), - /*inputCount=*/1, /*outputCount=*/1, indexing_maps, + /*argsIn=*/1, /*argsOut=*/1, indexing_maps, GetNParallelLoopsAttrs(nloops), [&](OpBuilder& nestedBuilder, Location nestedLoc, ValueRange args) { nestedBuilder.create(loc, *args.begin()); @@ -640,25 +639,25 @@ class ReshapeOpConverter : public OpConversionPattern { } }; -class IotaConverter : public OpConversionPattern { +template +class IotaConverter : public OpConversionPattern { public: - using OpConversionPattern::OpConversionPattern; + using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite( - lmhlo::IotaOp iotaOp, ArrayRef args, + OpTy iotaOp, ArrayRef args, ConversionPatternRewriter& rewriter) const final { - auto resultMemrefType = - iotaOp.getOperand().getType().dyn_cast(); - if (!resultMemrefType) return failure(); + ShapedType resultShapedType = getHloOpResultType(iotaOp); + if (!resultShapedType) return failure(); - auto resultElementType = resultMemrefType.getElementType(); + auto resultElementType = resultShapedType.getElementType(); if (!resultElementType.isSignlessIntOrFloat()) return failure(); // Construct the indexing maps needed for linalg.generic ops. - unsigned nloops = resultMemrefType.getRank(); + unsigned nloops = resultShapedType.getRank(); - rewriter.create( - iotaOp.getLoc(), ArrayRef{}, args, + auto linalgOp = rewriter.create( + iotaOp.getLoc(), isLHLO ? 
ArrayRef{} : resultShapedType, args, 0, // args_in 1, // args_out llvm::makeArrayRef(rewriter.getMultiDimIdentityMap(nloops)), @@ -669,14 +668,16 @@ class IotaConverter : public OpConversionPattern { nestedLoc, ivs[iotaOp.iota_dimension().getZExtValue()], nestedBuilder.getIntegerType( resultElementType.getIntOrFloatBitWidth())); - if (resultElementType.isa()) { + if (resultElementType.template isa()) { castOp = nestedBuilder.create(nestedLoc, castOp, resultElementType); } nestedBuilder.create(nestedLoc, castOp); }); - - rewriter.replaceOp(iotaOp, llvm::None); + if (isLHLO) + rewriter.replaceOp(iotaOp, llvm::None); + else + rewriter.replaceOp(iotaOp, linalgOp.output_tensors()); return success(); } }; @@ -768,7 +769,7 @@ void populateLHLOToLinalgConversionPattern(MLIRContext* context, patterns->insert, ConstConverter, ConvToLinalgConverter, - IotaConverter, + IotaConverter, LhloBroadcastInDimConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, @@ -824,8 +825,8 @@ void populateLHLOToLinalgConversionPattern(MLIRContext* context, // indexing_maps = [#map0, #map0, #map0], // iterator_types = ["parallel", "parallel"], // } : (memref<2x2xf32>, memref<2x2xf32>, memref<2x2xf32>) -> () -struct LhloLegalizeToLinalg - : public PassWrapper { +struct LhloLegalizeToLinalgPass + : public PassWrapper { void runOnFunction() override { OwningRewritePatternList patterns; ConversionTarget target(getContext()); @@ -840,8 +841,8 @@ struct LhloLegalizeToLinalg } }; -struct HloLegalizeToLinalg - : public PassWrapper { +struct HloLegalizeToLinalgPass + : public PassWrapper { void runOnFunction() override { OwningRewritePatternList patterns; ConversionTarget target(getContext()); @@ -859,54 +860,49 @@ struct HloLegalizeToLinalg namespace lmhlo { std::unique_ptr> createLegalizeLhloToLinalgPass() { - return absl::make_unique(); + return std::make_unique(); } - -static PassRegistration legalize_lhlo_pass( - "lhlo-legalize-to-linalg", "Legalize from LHLO dialect to Linalg dialect"); } // namespace lmhlo namespace mhlo { void populateHLOToLinalgConversionPattern(MLIRContext* context, OwningRewritePatternList* patterns) { - patterns->insert, - HloBroadcastInDimConverter, - PointwiseToLinalgConverter, - PointwiseToLinalgConverter, - PointwiseToLinalgConverter, - PointwiseToLinalgConverter, - PointwiseToLinalgConverter, - PointwiseToLinalgConverter, - PointwiseToLinalgConverter, - PointwiseToLinalgConverter, - PointwiseToLinalgConverter, - PointwiseToLinalgConverter, - PointwiseToLinalgConverter, - PointwiseToLinalgConverter, - PointwiseToLinalgConverter, - PointwiseToLinalgConverter, - PointwiseToLinalgConverter, - PointwiseToLinalgConverter, - PointwiseToLinalgConverter, - PointwiseToLinalgConverter, - PointwiseToLinalgConverter, - PointwiseToLinalgConverter, - PointwiseToLinalgConverter, - PointwiseToLinalgConverter, - PointwiseToLinalgConverter, - PointwiseToLinalgConverter, - PointwiseToLinalgConverter, - ReshapeOpConverter, - ReverseConverter, - TransposeConverter>(context); + patterns + ->insert, + HloBroadcastInDimConverter, IotaConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + 
PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + ReshapeOpConverter, + ReverseConverter, + TransposeConverter>(context); } std::unique_ptr> createLegalizeHloToLinalgPass() { - return absl::make_unique(); + return std::make_unique(); } - -static PassRegistration legalize_hlo_pass( - "hlo-legalize-to-linalg", "Legalize from HLO dialect to Linalg dialect"); } // namespace mhlo } // namespace mlir diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_standard.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_standard.cc index c71aa1d0460..cc574e008d5 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_standard.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_standard.cc @@ -16,17 +16,17 @@ limitations under the License. // This file implements logic for lowering MHLO dialect to Standard dialect. #include "llvm/ADT/StringSwitch.h" -#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project -#include "mlir/IR/Function.h" // from @llvm-project -#include "mlir/IR/PatternMatch.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/transforms/passes.h" +#include "mlir-hlo/Dialect/mhlo/transforms/rewriters.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/IR/Function.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Pass/Pass.h" namespace mlir { namespace { -#include "tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/generated_legalize_to_standard.inc" +#include "generated_legalize_to_standard.inc" } // end anonymous namespace namespace mhlo { namespace { @@ -176,15 +176,15 @@ class ConvertIotaOp : public OpRewritePattern { } // end anonymous namespace namespace { -struct LegalizeToStandard - : public PassWrapper { +struct LegalizeToStandardPass + : public PassWrapper { /// Perform the lowering to Standard dialect. void runOnFunction() override; }; } // end anonymous namespace std::unique_ptr> createLegalizeToStdPass() { - return std::make_unique(); + return std::make_unique(); } void PopulateMhloToStdPatterns(OwningRewritePatternList *patterns, @@ -194,14 +194,11 @@ void PopulateMhloToStdPatterns(OwningRewritePatternList *patterns, } /// Perform the lowering to standard dialect. 
-void LegalizeToStandard::runOnFunction() { +void LegalizeToStandardPass::runOnFunction() { OwningRewritePatternList patterns; mlir::mhlo::PopulateMhloToStdPatterns(&patterns, &getContext()); applyPatternsAndFoldGreedily(getFunction(), patterns); } -static PassRegistration legalize_pass( - "mhlo-legalize-to-std", "Legalize from MHLO dialect to standard dialect"); - } // end namespace mhlo } // end namespace mlir diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_standard_patterns.td b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_standard_patterns.td index 0e6fdf06701..ea67c052c5c 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_standard_patterns.td +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_standard_patterns.td @@ -17,7 +17,7 @@ limitations under the License. include "mlir/IR/OpBase.td" include "mlir/Dialect/StandardOps/IR/Ops.td" -include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.td" +include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.td" //===----------------------------------------------------------------------===// // Nullary op patterns. diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_copy_removal.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_copy_removal.cc index d2607887482..7a4418466b5 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_copy_removal.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_copy_removal.cc @@ -15,12 +15,11 @@ limitations under the License. // This file implements a pass to remove redundant LHLO copy operations. -#include "absl/memory/memory.h" -#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project -#include "mlir/IR/Operation.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h" +#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/transforms/passes.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/IR/Operation.h" +#include "mlir/Pass/Pass.h" namespace mlir { namespace lmhlo { @@ -30,7 +29,8 @@ namespace { // arguments. All uses of each buffer are replaced with the corresponding block // argument and the buffer is freed. Note that this pass only works in regions // with a single block. -struct LhloCopyRemoval : mlir::PassWrapper> { +struct LhloCopyRemovalPass + : mlir::PassWrapper> { void runOnOperation() override { llvm::SmallVector eraseList; auto operation = getOperation(); @@ -95,11 +95,8 @@ struct LhloCopyRemoval : mlir::PassWrapper> { } // namespace std::unique_ptr createLhloCopyRemovalPass() { - return absl::make_unique(); + return std::make_unique(); } -static PassRegistration copy_removal_pass( - "lhlo-copy-removal", "Removes redundant LHLO copy operations"); - } // namespace lmhlo } // namespace mlir diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_fuse_linalg.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_fuse_linalg.cc index d832b96bf7b..1467f015dc9 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_fuse_linalg.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_fuse_linalg.cc @@ -16,15 +16,14 @@ limitations under the License. 
// This file implements logic for fusing linalg ops obtained after LHLO // lowering. -#include "mlir/Dialect/Linalg/Analysis/DependenceAnalysis.h" -#include "absl/memory/memory.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" -#include "mlir/Dialect/Linalg/Transforms/Transforms.h" // from @llvm-project -#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Transforms/FoldUtils.h" // from @llvm-project -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h" +#include "mlir-hlo/Dialect/mhlo/transforms/passes.h" +#include "mlir/Dialect/Linalg/Analysis/DependenceAnalysis.h" +#include "mlir/Dialect/Linalg/Transforms/Transforms.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/FoldUtils.h" namespace mlir { namespace lmhlo { @@ -32,11 +31,13 @@ namespace { using linalg::LinalgOp; -class LhloFuseLinalg : public PassWrapper { +class LhloFuseLinalgPass + : public PassWrapper { public: - LhloFuseLinalg() = default; - LhloFuseLinalg(const LhloFuseLinalg&) {} - LhloFuseLinalg(bool use_parallel_loops, llvm::ArrayRef tile_sizes) { + LhloFuseLinalgPass() = default; + LhloFuseLinalgPass(const LhloFuseLinalgPass&) {} + LhloFuseLinalgPass(bool use_parallel_loops, + llvm::ArrayRef tile_sizes) { tile_sizes_ = tile_sizes; use_parallel_loops_.setValue(use_parallel_loops); } @@ -138,14 +139,10 @@ class LhloFuseLinalg : public PassWrapper { } // namespace -std::unique_ptr> createLhloFuseLinalg( +std::unique_ptr createLhloFuseLinalgPass( bool use_parallel_loops, ArrayRef tile_sizes) { - return absl::make_unique(use_parallel_loops, tile_sizes); + return std::make_unique(use_parallel_loops, tile_sizes); } -static PassRegistration legalize_pass( - "lhlo-fuse-linalg", - "Greedily fuse linalg ops obtained after LHLO lowering."); - } // namespace lmhlo } // namespace mlir diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_affine.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_affine.cc index a353472be4b..07891327775 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_affine.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_affine.cc @@ -15,17 +15,16 @@ limitations under the License. // This file implements logic for lowering LHLO dialect to Affine dialect. 
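With the static registration gone, the fusion pass is built explicitly, and its options (use_parallel_loops, tile_sizes) are passed to the factory. A hedged usage sketch; the element type of tile_sizes and the exact return type of createLhloFuseLinalgPass are assumptions, since the template arguments are not visible in the hunk above, and the pipeline function name is made up:

    // Sketch only: explicit construction of the fusion pass in a pipeline.
    #include "llvm/ADT/SmallVector.h"
    #include "mlir-hlo/Dialect/mhlo/transforms/passes.h"
    #include "mlir/IR/Function.h"
    #include "mlir/Pass/PassManager.h"

    void BuildLhloFusionPipeline(mlir::PassManager &pm) {
      // Illustrative tile sizes; real values depend on the target.
      llvm::SmallVector<unsigned, 2> tile_sizes = {2, 2};
      pm.addNestedPass<mlir::FuncOp>(mlir::lmhlo::createLhloFuseLinalgPass(
          /*use_parallel_loops=*/true, tile_sizes));
    }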
-#include "absl/memory/memory.h" -#include "mlir/Dialect/Affine/IR/AffineOps.h" // from @llvm-project -#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project -#include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/Location.h" // from @llvm-project -#include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/IR/PatternMatch.h" // from @llvm-project -#include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_lmhlo_to_scalar_op.h" +#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/transforms/map_lmhlo_to_scalar_op.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Location.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/Pass/Pass.h" namespace mlir { namespace lmhlo { @@ -138,8 +137,8 @@ void populateLHLOToAffineConversionPattern(MLIRContext* context, // clang-format on } -struct LhloLegalizeToAffine - : public PassWrapper { +struct LhloLegalizeToAffinePass + : public PassWrapper { void runOnFunction() override { OwningRewritePatternList patterns; auto func = getFunction(); @@ -150,12 +149,9 @@ struct LhloLegalizeToAffine } // namespace -std::unique_ptr> createLegalizeToAffinePass() { - return absl::make_unique(); +std::unique_ptr> createLhloLegalizeToAffinePass() { + return std::make_unique(); } -static PassRegistration legalize_pass( - "lhlo-legalize-to-affine", "Legalize from LHLO dialect to affine dialect"); - } // namespace lmhlo } // namespace mlir diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_gpu.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_gpu.cc index 0ff491a93c3..cffb58b37de 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_gpu.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_gpu.cc @@ -17,25 +17,24 @@ limitations under the License. 
#include -#include "absl/memory/memory.h" #include "llvm/ADT/ArrayRef.h" -#include "mlir/Dialect/GPU/GPUDialect.h" // from @llvm-project -#include "mlir/Dialect/Linalg/IR/LinalgOps.h" // from @llvm-project -#include "mlir/Dialect/SCF/SCF.h" // from @llvm-project -#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project -#include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/BlockAndValueMapping.h" // from @llvm-project -#include "mlir/IR/Builders.h" // from @llvm-project -#include "mlir/IR/Function.h" // from @llvm-project -#include "mlir/IR/Location.h" // from @llvm-project -#include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/IR/Operation.h" // from @llvm-project -#include "mlir/IR/PatternMatch.h" // from @llvm-project -#include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Transforms/DialectConversion.h" // from @llvm-project -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_lmhlo_to_scalar_op.h" +#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/transforms/map_lmhlo_to_scalar_op.h" +#include "mlir/Dialect/GPU/GPUDialect.h" +#include "mlir/Dialect/Linalg/IR/LinalgOps.h" +#include "mlir/Dialect/SCF/SCF.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/IR/Attributes.h" +#include "mlir/IR/BlockAndValueMapping.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/Function.h" +#include "mlir/IR/Location.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/DialectConversion.h" namespace mlir { namespace lmhlo { @@ -148,9 +147,9 @@ class LhloReduceToGPULaunchConverter : public OpConversionPattern { // Now copy over the actual body of the reduction, leaving out the // terminator. 
BlockAndValueMapping mapping; - mapping.map(reduce_op.body().front().getArgument(0), accumulator); - mapping.map(reduce_op.body().front().getArgument(1), rhs); - mapping.map(reduce_op.body().front().getArgument(2), accumulator); + mapping.map(reduce_op.body().getArgument(0), accumulator); + mapping.map(reduce_op.body().getArgument(1), rhs); + mapping.map(reduce_op.body().getArgument(2), accumulator); for (auto& nested : reduce_op.body().front().without_terminator()) { auto clone = rewriter.clone(nested, mapping); for (auto pair : llvm::zip(nested.getResults(), clone->getResults())) { @@ -168,7 +167,8 @@ class LhloReduceToGPULaunchConverter : public OpConversionPattern { }; }; -struct LhloLegalizeToGpu : public PassWrapper { +struct LhloLegalizeToGpuPass + : public PassWrapper { void runOnFunction() override { OwningRewritePatternList patterns; ConversionTarget target(getContext()); @@ -185,12 +185,9 @@ struct LhloLegalizeToGpu : public PassWrapper { } // namespace -std::unique_ptr> createLegalizeToGpuPass() { - return absl::make_unique(); +std::unique_ptr createLegalizeToGpuPass() { + return std::make_unique(); } -static PassRegistration legalize_pass( - "lhlo-legalize-to-gpu", "Legalize from LHLO dialect to GPU dialect"); - } // namespace lmhlo } // namespace mlir diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_llvm.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_llvm.cc index 32606f068a8..42b71543543 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_llvm.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_llvm.cc @@ -13,12 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. 
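The reduction lowering above clones the body of the reduce op into the GPU launch region: each block argument of the body is remapped (onto the accumulator and rhs values) and every non-terminator op is cloned through the mapping. A reduced sketch of that idiom, with an illustrative helper name:

    // Sketch only: clone a single-block region body with remapped arguments.
    #include "llvm/ADT/STLExtras.h"
    #include "mlir/IR/BlockAndValueMapping.h"
    #include "mlir/IR/Builders.h"

    static void CloneBodyWithRemappedArgs(mlir::Region &body,
                                          mlir::ValueRange replacements,
                                          mlir::OpBuilder &builder) {
      mlir::BlockAndValueMapping mapping;
      for (auto pair : llvm::zip(body.front().getArguments(), replacements))
        mapping.map(std::get<0>(pair), std::get<1>(pair));
      // Clone everything except the terminator; cloned ops see the remapped
      // values instead of the original block arguments.
      for (mlir::Operation &nested : body.front().without_terminator())
        builder.clone(nested, mapping);
    }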
==============================================================================*/ -#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" // from @llvm-project -#include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project -#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project -#include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/Transforms/DialectConversion.h" // from @llvm-project -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" +#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/Transforms/DialectConversion.h" namespace mlir { namespace lmhlo { @@ -133,8 +133,8 @@ struct ReshapeMemRefCastOpConverter Location loc = op->getLoc(); auto reshape_op = cast(op); - Type dst_type = reshape_op.getResult().getType(); - auto element_type = dst_type.cast().getElementType(); + auto dst_type = reshape_op.getResult().getType().cast(); + auto element_type = dst_type.getElementType(); auto shape = reshape_op.shape(); @@ -162,18 +162,17 @@ struct ReshapeMemRefCastOpConverter desc.setAlignedPtr(rewriter, loc, ptrs_n_offset.aligned_ptr); desc.setOffset(rewriter, loc, ptrs_n_offset.offset); - auto llvmIndexTy = typeConverter.convertType(rewriter.getIndexType()) - .cast(); - auto llvmIndexTyPtr = llvmIndexTy.getPointerTo(); + auto llvm_index_type = typeConverter.getIndexType(); + auto llvm_index_ptr_type = llvm_index_type.getPointerTo(); Value stride_carried = rewriter.create( - loc, llvmIndexTy, + loc, llvm_index_type, rewriter.getIntegerAttr(rewriter.getIndexType(), 1)); for (int i = shape_length - 1; i >= 0; --i) { Value pos = rewriter.create( - loc, llvmIndexTy, + loc, llvm_index_type, rewriter.getIntegerAttr(rewriter.getIndexType(), i)); Value ptr = rewriter.create( - loc, llvmIndexTyPtr, shape_desc.alignedPtr(rewriter, loc), + loc, llvm_index_ptr_type, shape_desc.alignedPtr(rewriter, loc), ValueRange{pos}); Value extracted_size = rewriter.create(loc, ptr); desc.setSize(rewriter, loc, i, extracted_size); @@ -188,7 +187,7 @@ struct ReshapeMemRefCastOpConverter rewriter.replaceOp(op, {desc}); } else { Value rank = rewriter.create( - loc, llvmIndexTy, + loc, llvm_index_type, rewriter.getIntegerAttr(rewriter.getIndexType(), shape_length)); Value alloca = typeConverter.promoteOneMemRefDescriptor(loc, desc, rewriter); @@ -199,15 +198,126 @@ struct ReshapeMemRefCastOpConverter {rank, void_ptr}); rewriter.replaceOp(op, {unranked_desc}); } - } else { - /* - * TODO(pifon, herhut): - * Compute strides with llvm.loop; - * Use UnrankedMemrefDescr::ComputeSize with Alloca; - * Set all the fields using getelementptr. - */ - return failure(); + return success(); } + + // The shape is a rank-1 tensor with unknown length. + Value result_rank = shape_desc.size(rewriter, loc, 0); + // TODO(herhut): Propely handle address spaces. + unsigned address_space = 0; + auto target_type = + typeConverter + .convertType(UnrankedMemRefType::get(element_type, address_space)) + .cast(); + // Create the unranked memref descriptor that holds the ranked one. The + // inner descriptor is allocated on stack. 
+ UnrankedMemRefDescriptor target_desc = + UnrankedMemRefDescriptor::undef(rewriter, loc, target_type); + target_desc.setRank(rewriter, loc, result_rank); + SmallVector sizes; + UnrankedMemRefDescriptor::computeSizes(rewriter, loc, typeConverter, + {target_desc}, sizes); + auto void_ptr_type = LLVM::LLVMType::getInt8PtrTy(rewriter.getContext()); + Value ranked_desc_mem = rewriter.create( + loc, void_ptr_type, sizes.front(), llvm::None); + target_desc.setMemRefDescPtr(rewriter, loc, ranked_desc_mem); + + // Fill the fixed parts. For this, we cast to a 0-D memref. + auto zero_d_memref_type = MemRefType::get({}, element_type); + Value as_zero_d = rewriter.create( + loc, + typeConverter.convertType(zero_d_memref_type) + .cast() + .getPointerTo(address_space), + ranked_desc_mem); + // Some common constants. Use 32 bit where required by gep struct indexes. + auto int32_type = typeConverter.convertType(rewriter.getI32Type()); + Value zero_index = rewriter.create( + loc, typeConverter.getIndexType(), rewriter.getIndexAttr(0)); + Value zero = rewriter.create( + loc, int32_type, rewriter.getI32IntegerAttr(0)); + Value one = rewriter.create( + loc, int32_type, rewriter.getI32IntegerAttr(1)); + Value two = rewriter.create( + loc, int32_type, rewriter.getI32IntegerAttr(2)); + // Set base_pointer and aligned pointer. + auto element_ptr_ptr_type = typeConverter.convertType(element_type) + .cast() + .getPointerTo(address_space) + .getPointerTo(address_space); + auto base_gep = rewriter.create( + loc, element_ptr_ptr_type, as_zero_d, ValueRange({zero_index, zero})); + rewriter.create(loc, ptrs_n_offset.allocated_ptr, base_gep); + auto aligned_gep = rewriter.create( + loc, element_ptr_ptr_type, as_zero_d, ValueRange({zero_index, one})); + rewriter.create(loc, ptrs_n_offset.aligned_ptr, aligned_gep); + // Set offset. + auto index_ptr_type = + typeConverter.getIndexType().getPointerTo(address_space); + auto offset_gep = rewriter.create( + loc, index_ptr_type, as_zero_d, ValueRange({zero_index, two})); + rewriter.create(loc, ptrs_n_offset.offset, offset_gep); + + // Use the offset pointer as base for further addressing. Copy over the + // new shape and compute strides. For this, we need to create a loop from + // rank - 1 to 0. + Value one_index = rewriter.create( + loc, typeConverter.getIndexType(), rewriter.getIndexAttr(1)); + auto target_shape_base = rewriter.create( + loc, index_ptr_type, offset_gep, ValueRange({one})); + auto target_strides_base = rewriter.create( + loc, index_ptr_type, target_shape_base, ValueRange({result_rank})); + auto shape_ptr = shape_desc.alignedPtr(rewriter, loc); + auto result_rank_minus_one = + rewriter.create(loc, result_rank, one_index); + + Block *init_block = rewriter.getInsertionBlock(); + Block *cond_block = + rewriter.splitBlock(init_block, rewriter.getInsertionPoint()); + rewriter.setInsertionPointToEnd(init_block); + rewriter.create( + loc, ValueRange({result_rank_minus_one, one_index}), cond_block); + rewriter.setInsertionPointToStart(cond_block); + auto index_arg = cond_block->addArgument(typeConverter.getIndexType()); + auto stride_arg = cond_block->addArgument(typeConverter.getIndexType()); + auto pred = rewriter.create( + loc, LLVM::LLVMType::getInt1Ty(rewriter.getContext()), + LLVM::ICmpPredicate::sge, index_arg, zero_index); + + Block *body_block = + rewriter.splitBlock(cond_block, rewriter.getInsertionPoint()); + rewriter.setInsertionPointToStart(body_block); + + // Copy size from shape to descriptor. 
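The GEP struct indices 0, 1 and 2 used above address the leading fields of MLIR's standard ranked memref descriptor after the cast to a 0-D memref. For orientation, the descriptor layout corresponds roughly to the following struct (a sketch of the convention, not code from this patch); the per-dimension sizes and strides follow as two trailing arrays of length rank:

    // Sketch of the ranked memref descriptor layout targeted by the GEPs above.
    #include <cstdint>

    template <typename T>
    struct RankedMemRefDescriptorHead {
      T *allocated_ptr;  // field 0: pointer returned by the allocator
      T *aligned_ptr;    // field 1: aligned pointer used for element access
      int64_t offset;    // field 2: element offset into aligned_ptr
      // int64_t sizes[rank];   // written via target_shape_base below
      // int64_t strides[rank]; // written via target_strides_base below
    };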
+ auto size_load_gep = rewriter.create( + loc, index_ptr_type, shape_ptr, ValueRange{index_arg}); + auto extracted_size = rewriter.create(loc, size_load_gep); + auto size_store_gep = rewriter.create( + loc, index_ptr_type, target_shape_base, ValueRange({index_arg})); + rewriter.create(loc, extracted_size, size_store_gep); + // Write stride value and compute next one. + auto stride_store_gep = rewriter.create( + loc, index_ptr_type, target_strides_base, ValueRange({index_arg})); + rewriter.create(loc, stride_arg, stride_store_gep); + auto next_stride = + rewriter.create(loc, stride_arg, extracted_size); + + // Decrement loop counter and branch back. + auto decrement = rewriter.create(loc, index_arg, one_index); + rewriter.create(loc, ValueRange({decrement, next_stride}), + cond_block); + + Block *remainder = + rewriter.splitBlock(body_block, rewriter.getInsertionPoint()); + + // Hook up the cond exit to the remainder. + rewriter.setInsertionPointToEnd(cond_block); + rewriter.create(loc, pred, body_block, ValueRange(), + remainder, ValueRange()); + + // Reset position to beginning of new remainder block. + rewriter.setInsertionPointToStart(remainder); + rewriter.replaceOp(op, {target_desc}); return success(); } @@ -250,11 +360,10 @@ struct ReshapeMemRefCastOpConverter } // namespace -void PopulateLhloToLLVMConversionPatterns(const LowerToLLVMOptions &options, - LLVMTypeConverter *converter, +void PopulateLhloToLLVMConversionPatterns(LLVMTypeConverter *converter, OwningRewritePatternList *patterns) { patterns->insert(*converter, options); + StaticMemRefCastOpConverter>(*converter); } } // namespace lmhlo diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_llvm_pass.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_llvm_pass.cc index d6cda99a912..8493a1feb5d 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_llvm_pass.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_llvm_pass.cc @@ -13,16 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. 
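The branch-based loop built above walks the new shape from the innermost dimension outwards, copying each extent into the descriptor and carrying a running product as the stride. In plain C++, the computation the emitted LLVM-dialect loop performs is the usual row-major stride calculation (reference sketch only):

    // Reference sketch of the stride computation encoded by the loop above.
    #include <cstdint>
    #include <vector>

    std::vector<int64_t> RowMajorStrides(const std::vector<int64_t> &shape) {
      std::vector<int64_t> strides(shape.size());
      int64_t carried = 1;  // mirrors the `stride_arg` block argument
      for (int64_t i = static_cast<int64_t>(shape.size()) - 1; i >= 0; --i) {
        strides[i] = carried;
        carried *= shape[i];  // mirrors `next_stride = stride_arg * size`
      }
      return strides;
    }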
==============================================================================*/ -#include "mlir/Conversion/AffineToStandard/AffineToStandard.h" // from @llvm-project -#include "mlir/Conversion/SCFToStandard/SCFToStandard.h" // from @llvm-project -#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" // from @llvm-project -#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" // from @llvm-project -#include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project -#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project -#include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h" +#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/transforms/rewriters.h" +#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" +#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/Pass/Pass.h" namespace mlir { namespace lmhlo { @@ -36,13 +34,9 @@ class TestLhloToLLVMPass ModuleOp m = getOperation(); OwningRewritePatternList patterns; - LLVMTypeConverter converter(m.getContext()); + LLVMTypeConverter converter(&getContext()); populateStdToLLVMConversionPatterns(converter, patterns); - PopulateLhloToLLVMConversionPatterns( - LowerToLLVMOptions::getDefaultOptions(), &converter, &patterns); - mlir::populateLoopToStdConversionPatterns(patterns, &getContext()); - - mlir::populateAffineToStdConversionPatterns(patterns, m.getContext()); + PopulateLhloToLLVMConversionPatterns(&converter, &patterns); ConversionTarget target(getContext()); target.addLegalDialect(); @@ -57,8 +51,9 @@ class TestLhloToLLVMPass } // namespace -static PassRegistration legalize_lhlo_pass( - "test-lhlo-legalize-to-llvm", "Legalize from LHLO dialect to LLVM."); +std::unique_ptr createTestLhloToLLVMPass() { + return std::make_unique(); +} } // namespace lmhlo } // namespace mlir diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_parallel_loops.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_parallel_loops.cc index 4255d87d48e..19f47d08c0d 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_parallel_loops.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_parallel_loops.cc @@ -13,17 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. 
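TestLhloToLLVMPass now builds its LLVMTypeConverter directly from the pass context and populates only the std-to-LLVM and LHLO-to-LLVM patterns, without the affine and SCF lowerings it previously pulled in. The overall driver follows the usual dialect-conversion recipe; a hedged sketch of that structure as a free function (the function name is made up, includes assumed from the hunks above):

    // Sketch only: mirrors the structure of the test pass shown above.
    #include "mlir-hlo/Dialect/mhlo/transforms/rewriters.h"
    #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
    #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"
    #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
    #include "mlir/IR/Module.h"
    #include "mlir/Transforms/DialectConversion.h"

    mlir::LogicalResult RunLhloToLLVM(mlir::ModuleOp module) {
      mlir::MLIRContext *ctx = module.getContext();
      mlir::LLVMTypeConverter converter(ctx);

      mlir::OwningRewritePatternList patterns;
      mlir::populateStdToLLVMConversionPatterns(converter, patterns);
      mlir::lmhlo::PopulateLhloToLLVMConversionPatterns(&converter, &patterns);

      mlir::ConversionTarget target(*ctx);
      target.addLegalDialect<mlir::LLVM::LLVMDialect>();
      target.addLegalOp<mlir::ModuleOp, mlir::ModuleTerminatorOp>();
      return mlir::applyFullConversion(module, target, patterns);
    }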
==============================================================================*/ -#include "absl/memory/memory.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" -#include "mlir/Dialect/Linalg/IR/LinalgOps.h" // from @llvm-project -#include "mlir/Dialect/SCF/SCF.h" // from @llvm-project -#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project -#include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Transforms/DialectConversion.h" // from @llvm-project -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" +#include "mlir/Dialect/Linalg/IR/LinalgOps.h" +#include "mlir/Dialect/SCF/SCF.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/DialectConversion.h" namespace mlir { namespace lmhlo { @@ -690,8 +689,8 @@ class SelectAndScatterOpConverter } }; -struct LhloLegalizeToParallelLoops - : public PassWrapper { +struct LhloLegalizeToParallelLoopsPass + : public PassWrapper { void runOnFunction() override { auto func = getFunction(); @@ -715,16 +714,11 @@ struct LhloLegalizeToParallelLoops } } }; - } // namespace std::unique_ptr> createLegalizeLhloToParallelLoopsPass() { - return absl::make_unique(); + return std::make_unique(); } -static PassRegistration legalize_lhlo_pass( - "lhlo-legalize-to-parallel-loops", - "Legalize from LHLO dialect to parallel loops."); - } // namespace lmhlo } // namespace mlir diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lower_complex.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lower_complex.cc index 54ea4955573..9f7c946577d 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lower_complex.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lower_complex.cc @@ -23,17 +23,17 @@ limitations under the License. #include #include "llvm/ADT/STLExtras.h" -#include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/IR/Operation.h" // from @llvm-project -#include "mlir/IR/PatternMatch.h" // from @llvm-project -#include "mlir/IR/TypeUtilities.h" // from @llvm-project -#include "mlir/IR/Types.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Pass/PassRegistry.h" // from @llvm-project -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/hlo_utils.h" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/transforms/passes.h" +#include "mlir-hlo/utils/hlo_utils.h" +#include "mlir/IR/Attributes.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/TypeUtilities.h" +#include "mlir/IR/Types.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassRegistry.h" using mlir::FunctionPass; using mlir::OwningRewritePatternList; @@ -41,9 +41,9 @@ using mlir::PassRegistration; using mlir::PassWrapper; namespace { -class LowerComplex : public PassWrapper { +class LowerComplexPass : public PassWrapper { public: - explicit LowerComplex() : PassWrapper() {} + explicit LowerComplexPass() : PassWrapper() {} /// Performs the lowering to MHLO dialect. 
void runOnFunction() override; @@ -51,10 +51,10 @@ class LowerComplex : public PassWrapper { } // end anonymous namespace namespace mlir { -namespace hlo { +namespace mhlo { namespace { -#include "tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/generated_lower_complex.inc" +#include "generated_lower_complex.inc" } // end anonymous namespace @@ -62,18 +62,18 @@ void PopulateComplexLoweringPatterns(MLIRContext* context, OwningRewritePatternList* patterns) { populateWithGenerated(context, patterns); } -} // end namespace hlo +} // end namespace mhlo } // end namespace mlir // Lowers the complex operations that can be represented using other operations. -void LowerComplex::runOnFunction() { +void LowerComplexPass::runOnFunction() { // Add lowering patterns to the list. OwningRewritePatternList patterns; - mlir::hlo::PopulateComplexLoweringPatterns(&getContext(), &patterns); + mlir::mhlo::PopulateComplexLoweringPatterns(&getContext(), &patterns); applyPatternsAndFoldGreedily(getFunction(), patterns); } -static PassRegistration pass( - "mhlo-test-lower-complex", - "Lower complex operations into non-complex operations"); +std::unique_ptr mlir::mhlo::createLowerComplexPass() { + return std::make_unique(); +} diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lower_complex_patterns.td b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lower_complex_patterns.td index 0b72ccaa823..2cc97c90d1c 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lower_complex_patterns.td +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lower_complex_patterns.td @@ -18,7 +18,7 @@ limitations under the License. include "mlir/IR/OpBase.td" include "mlir/Dialect/StandardOps/IR/Ops.td" -include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.td" +include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.td" //===----------------------------------------------------------------------===// // Binary op patterns. @@ -89,12 +89,10 @@ def : Pat<(HLO_DivOp HLO_ComplexTensor:$lhs, HLO_IntOrFpTensor:$rhs), // Absolute value is evaluated as: // result = sqrt(val.real * val.real + val.imag * val.imag) def : Pat<(HLO_AbsOp HLO_ComplexTensor:$val), - (HLO_ComplexOp (HLO_SqrtOp (HLO_AddOp (HLO_MulOp (HLO_RealOp:$real $val), $real), - (HLO_MulOp (HLO_ImagOp:$imag $val), $imag))), - (HLO_ConstOp (ConstantSplat<"0"> $real)))>; + (HLO_MulOp (HLO_ImagOp:$imag $val), $imag)))>; // Exponential can be lowered to an exponential on the real component and a // sum of sinusoids of the imaginary component, which equates to a normal diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lower_general_dot.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lower_general_dot.cc index 32a6ce42e5e..2bbd4691f95 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lower_general_dot.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lower_general_dot.cc @@ -17,18 +17,18 @@ limitations under the License. 
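The comment above the AbsOp pattern spells out the lowering it implements for complex inputs: result = sqrt(val.real * val.real + val.imag * val.imag), with the magnitude produced as a real value rather than wrapped back into a complex one. As a scalar reference for the same identity:

    // Scalar reference for the complex-abs lowering described above.
    #include <cmath>

    float ComplexAbs(float re, float im) {
      return std::sqrt(re * re + im * im);
    }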
#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringSwitch.h" -#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project -#include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/Function.h" // from @llvm-project -#include "mlir/IR/Location.h" // from @llvm-project -#include "mlir/IR/Operation.h" // from @llvm-project -#include "mlir/IR/PatternMatch.h" // from @llvm-project -#include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/IR/TypeUtilities.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/transforms/passes.h" +#include "mlir-hlo/Dialect/mhlo/transforms/rewriters.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Function.h" +#include "mlir/IR/Location.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/IR/TypeUtilities.h" +#include "mlir/Pass/Pass.h" using mlir::DenseIntElementsAttr; using mlir::ElementsAttr; @@ -170,8 +170,8 @@ struct GeneralDotConvert : public OpRewritePattern { } }; -struct LegalizeGeneralDot - : public PassWrapper { +struct LegalizeGeneralDotPass + : public PassWrapper { /// Lower all general dots that can be represented as a non-batched matmul. void runOnFunction() override { OwningRewritePatternList patterns; @@ -187,6 +187,6 @@ void mlir::mhlo::PopulateGeneralDotOpLoweringPatterns( patterns->insert(ctx); } -static PassRegistration legalize_pass( - "mhlo-test-lower-general-dot", - "Tests lowering general dot to a non-batched dot when possible"); +std::unique_ptr<::mlir::Pass> mlir::mhlo::createLegalizeGeneralDotPass() { + return std::make_unique(); +} diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/materialize_broadcasts.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/materialize_broadcasts.cc index c2f88ad5e31..445cf2e79fe 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/materialize_broadcasts.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/materialize_broadcasts.cc @@ -15,12 +15,12 @@ limitations under the License. 
#include -#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project -#include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/IR/Operation.h" // from @llvm-project -#include "mlir/IR/PatternMatch.h" // from @llvm-project -#include "mlir/Transforms/DialectConversion.h" // from @llvm-project -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Transforms/DialectConversion.h" namespace mlir { namespace mhlo { diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/materialize_broadcasts_pass.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/materialize_broadcasts_pass.cc index 1d5d593bd43..3909f046007 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/materialize_broadcasts_pass.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/materialize_broadcasts_pass.cc @@ -13,14 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project -#include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/IR/Operation.h" // from @llvm-project -#include "mlir/IR/PatternMatch.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Transforms/DialectConversion.h" // from @llvm-project -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/transforms/rewriters.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/DialectConversion.h" namespace mlir { namespace mhlo { @@ -50,9 +50,9 @@ struct TestMaterializeBroadcastsPass } // namespace +std::unique_ptr<::mlir::Pass> createTestMaterializeBroadcastsPass() { + return std::make_unique(); +} + } // namespace mhlo } // namespace mlir - -static mlir::PassRegistration pass( - "mhlo-test-materialize-broadcasts", - "Test pass for materializing 'broadcast_dimensions' attributes"); diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/mhlo_fusion.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/mhlo_fusion.cc index 91f9344b8c5..233d95a1a65 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/mhlo_fusion.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/mhlo_fusion.cc @@ -18,14 +18,14 @@ limitations under the License. 
#include #include +#include "llvm/ADT/EquivalenceClasses.h" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "mlir-hlo/utils/cycle_detector.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/MLIRContext.h" // TF:llvm-project #include "mlir/IR/Matchers.h" #include "mlir/Pass/Pass.h" // TF:local_config_mlir #include "mlir/Transforms/RegionUtils.h" // TF:llvm-project -#include "llvm/ADT/EquivalenceClasses.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/cycle_detector.h" // This pass has similar functionality of the fusion pass in XLA stack. // However, unlike XLA, it targets the fully dynamic shape scenario. @@ -479,7 +479,7 @@ class FusionPlanner { EquivalenceClasses leader_for_node_; }; -struct MhloFusion : public mlir::PassWrapper { +struct MhloFusionPass : public mlir::PassWrapper { void runOnFunction() override { FuncOp func = getFunction(); if (!IsTargetFunc(func)) { @@ -568,12 +568,9 @@ struct MhloFusion : public mlir::PassWrapper { } // namespace -std::unique_ptr> createMhloFusion() { - return std::make_unique(); +std::unique_ptr> createMhloFusionPass() { + return std::make_unique(); } -static PassRegistration mhlo_fusion_pass( - "mhlo-fusion", "fuse mhlo ops to kLoop/kInput fusion patterns."); - } // namespace mhlo } // namespace mlir diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/optimize_mhlo.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/optimize_mhlo.cc new file mode 100644 index 00000000000..43de47086bf --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/optimize_mhlo.cc @@ -0,0 +1,187 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file provides optional optimization patterns for mhlo, canonocalizing +// operations to equivalent but potentially more efficient operations. + +#include +#include +#include +#include + +#include "llvm/ADT/STLExtras.h" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/transforms/passes.h" +#include "mlir-hlo/utils/hlo_utils.h" +#include "mlir/IR/Attributes.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/TypeUtilities.h" +#include "mlir/IR/Types.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassRegistry.h" + +using mlir::OwningRewritePatternList; + +namespace mlir { +namespace mhlo { +namespace { + +// Returns 1D 64-bit dense elements attribute with the given values. 
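The fusion planner in MhloFusionPass groups ops with llvm::EquivalenceClasses (a union-find structure) and a cycle detector that keeps the fused graph acyclic. A reduced illustration of the union-find part, with made-up node ids:

    // Sketch only: demonstrates the llvm::EquivalenceClasses API used by the
    // fusion planner; node ids are made up.
    #include <cassert>
    #include "llvm/ADT/EquivalenceClasses.h"

    void DemoFusionGroups() {
      llvm::EquivalenceClasses<int> groups;
      groups.unionSets(1, 2);  // ops 1 and 2 end up in one fusion group
      groups.unionSets(2, 3);  // op 3 joins the same group transitively
      groups.insert(4);        // op 4 is known but stays unfused
      assert(groups.isEquivalent(1, 3));
      assert(!groups.isEquivalent(1, 4));
    }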
+static DenseIntElementsAttr GetI64ElementsAttr(ArrayRef values, + Builder* builder) { + RankedTensorType ty = RankedTensorType::get( + {static_cast(values.size())}, builder->getIntegerType(64)); + return DenseIntElementsAttr::get(ty, values); +} + +//===----------------------------------------------------------------------===// +// GatherOp +//===----------------------------------------------------------------------===// + +class GatherIsSlice : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(GatherOp gather, + PatternRewriter& rewriter) const override { + auto dimension_numbers = gather.dimension_numbers(); + + // Inputs need to be ranked to lower. + if (!gather.operand().getType().cast().hasRank() || + !gather.operand().getType().cast().hasStaticShape() || + !gather.start_indices().getType().cast().hasRank() || + !gather.start_indices().getType().cast().hasStaticShape()) { + return failure(); + } + + if (dimension_numbers.index_vector_dim().getValue().getSExtValue() != 0) { + return failure(); + } + + // TODO(suderman): Handle start index map != {0}. + if (!dimension_numbers.start_index_map() || + dimension_numbers.start_index_map().getType().getRank() != 1 || + dimension_numbers.start_index_map().getType().getDimSize(0) != 1 || + dimension_numbers.start_index_map() + .getValue({0}) + .cast() + .getValue() != 0) { + return failure(); + } + + auto result_ty = gather.getResult().getType().dyn_cast(); + + // Requires a ranked output. + if (!result_ty) { + return failure(); + } + if (dimension_numbers.offset_dims().getType().getNumElements() != + result_ty.getRank()) { + return failure(); + } + for (auto it : llvm::enumerate(dimension_numbers.offset_dims())) { + if (it.index() != it.value()) { + return failure(); + } + } + + // Verify the gather slice sizes are correct. + if (gather.slice_sizes().getNumElements() != + gather.operand().getType().cast().getRank()) { + return failure(); + } + + // Validate the slice sizes are correct. + if (gather.slice_sizes().getType().cast().getNumElements() < + result_ty.getShape().size() + 1) { + return failure(); + } + + for (auto it : llvm::enumerate(result_ty.getShape())) { + if (gather.slice_sizes() + .getValue(it.index() + 1) + .cast() + .getValue() != it.value()) { + return failure(); + } + } + + auto gather_start_indices = gather.start_indices(); + auto gather_start_indices_ty = + gather_start_indices.getType().cast(); + + llvm::SmallVector slice_start_indices; + + if (gather_start_indices_ty.getRank() == 0) { + slice_start_indices.push_back(gather_start_indices); + } else if (gather_start_indices_ty.getRank() == 1) { + for (int i = 0; i < gather_start_indices_ty.getDimSize(0); i++) { + auto start = GetI64ElementsAttr({i}, &rewriter); + auto limit = GetI64ElementsAttr({i + 1}, &rewriter); + auto stride = GetI64ElementsAttr({1}, &rewriter); + auto indicesSlice = rewriter.create( + gather.getLoc(), gather_start_indices, start, limit, stride); + auto reshaped = rewriter.create( + gather.getLoc(), + RankedTensorType::get( + {}, indicesSlice.getType().cast().getElementType()), + indicesSlice); + slice_start_indices.push_back(reshaped); + } + } else { + return failure(); + } + + auto sliceSizes = gather.slice_sizes(); + auto sliceSizesTy = sliceSizes.getType(); + if (sliceSizesTy.getRank() != 1) { + return failure(); + } + + // Start indices have implicit zeros when not specified. This is because + // Gather occurs similar to slicing where full slices are inferred. 
Add any + // missing zeros as necessary. + auto zero = rewriter.create( + gather.getLoc(), rewriter.getZeroAttr(RankedTensorType::get( + {}, gather_start_indices_ty.getElementType()))); + while (slice_start_indices.size() < sliceSizesTy.getDimSize(0)) { + slice_start_indices.push_back(zero); + } + + SmallVector sliceShape; + for (auto shapeValue : gather.slice_sizes().getIntValues()) { + sliceShape.push_back(shapeValue.getSExtValue()); + } + + auto sliceTy = + RankedTensorType::get(sliceShape, result_ty.getElementType()); + auto slice = rewriter.create( + gather.getLoc(), sliceTy, gather.operand(), slice_start_indices, + gather.slice_sizes()); + + rewriter.replaceOpWithNewOp(gather, gather.getType(), slice); + + return success(); + } +}; + +} // end anonymous namespace + +void PopulateOptimizeMHLOPatterns(MLIRContext* context, + OwningRewritePatternList* patterns) { + patterns->insert(context); +} +} // end namespace mhlo +} // end namespace mlir diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/optimize_mhlo_pass.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/optimize_mhlo_pass.cc new file mode 100644 index 00000000000..32a846e79ef --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/optimize_mhlo_pass.cc @@ -0,0 +1,51 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/transforms/passes.h" +#include "mlir-hlo/Dialect/mhlo/transforms/rewriters.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/DialectConversion.h" + +using mlir::FunctionPass; +using mlir::PassRegistration; +using mlir::PassWrapper; + +namespace { +class OptimizeMhloPass : public PassWrapper { + public: + explicit OptimizeMhloPass() : PassWrapper() {} + + /// Performs the lowering to MHLO dialect. + void runOnFunction() override; +}; +} // end anonymous namespace + +// Lowers the complex operations that can be represented using other operations. +void OptimizeMhloPass::runOnFunction() { + // Add lowering patterns to the list. 
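PopulateOptimizeMHLOPatterns exposes the GatherIsSlice rewrite as a reusable pattern set, so it can also be applied outside OptimizeMhloPass. A hedged sketch of such reuse; it assumes the populate function is declared in one of the mlir-hlo headers included above, and ApplyMhloOptimizations is a made-up name:

    // Sketch only: applying the optimization patterns from another driver.
    #include "mlir-hlo/Dialect/mhlo/transforms/rewriters.h"  // assumed header
    #include "mlir/IR/Function.h"
    #include "mlir/IR/PatternMatch.h"
    #include "mlir/Transforms/GreedyPatternRewriteDriver.h"

    void ApplyMhloOptimizations(mlir::FuncOp func) {
      mlir::OwningRewritePatternList patterns;
      mlir::mhlo::PopulateOptimizeMHLOPatterns(func.getContext(), &patterns);
      mlir::applyPatternsAndFoldGreedily(func, patterns);
    }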
+ mlir::OwningRewritePatternList patterns; + mlir::mhlo::PopulateOptimizeMHLOPatterns(&getContext(), &patterns); + + applyPatternsAndFoldGreedily(getFunction(), patterns); +} + +std::unique_ptr mlir::mhlo::createOptimizeMhloPass() { + return std::make_unique(); +} diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/sink_constants_to_control_flow.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/sink_constants_to_control_flow.cc index b05918030e9..8d677f45c19 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/sink_constants_to_control_flow.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/sink_constants_to_control_flow.cc @@ -15,12 +15,13 @@ limitations under the License. #include "llvm/ADT/DenseMap.h" #include "llvm/Support/Casting.h" -#include "mlir/IR/Operation.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Pass/PassManager.h" // from @llvm-project -#include "mlir/Support/LLVM.h" // from @llvm-project -#include "mlir/Transforms/RegionUtils.h" // from @llvm-project -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/IR/Operation.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Transforms/RegionUtils.h" namespace mlir { namespace mhlo { @@ -29,8 +30,16 @@ namespace { // A pass that sinks constants implicitly captured in control flow regions. This // is necessary to export to XLA. -class SinkConstantsToControlFlow - : public mlir::PassWrapper { +// +// TODO(hinsu): Generalize this pass to handle all the ops with regions. Any +// value used within the region that is defined outside of op's region should be +// sank to the regions and not just the constants. Ops such as If and While +// whose computations doesn't require fixed signature like Sort or Reduce have +// an option to pass outside values as operands of the op to avoid recomputing +// those within internally. Note that doing so is the only option in case of +// values defined outside that are BlockArguments of any of the parent region. +class SinkConstantsToControlFlowPass + : public mlir::PassWrapper { void runOnFunction() override { getFunction().walk([](Operation* op) { if (auto while_op = llvm::dyn_cast(op)) { @@ -39,6 +48,8 @@ class SinkConstantsToControlFlow } else if (auto if_op = llvm::dyn_cast(op)) { SinkToRegion(&if_op.true_branch()); SinkToRegion(&if_op.false_branch()); + } else if (auto sort_op = llvm::dyn_cast(op)) { + SinkToRegion(&sort_op.comparator()); } }); } @@ -46,39 +57,36 @@ class SinkConstantsToControlFlow private: // Performs constant sinking into a region. static void SinkToRegion(Region* region) { - llvm::DenseMap sunk_constant; + llvm::DenseMap sunk_constant; visitUsedValuesDefinedAbove({*region}, [&](OpOperand* use) { Value constant = use->get(); - auto const_op = dyn_cast_or_null(constant.getDefiningOp()); - if (!const_op) return; + auto op = constant.getDefiningOp(); + if (!op || !op->hasTrait()) return; auto map_entry = sunk_constant.try_emplace(constant, nullptr); if (!map_entry.second) { // This constant has already been cloned into the region, reuse it. 
- use->set(map_entry.first->getSecond().getResult()); - if (constant.use_empty()) const_op.erase(); + use->set(map_entry.first->getSecond()->getResult(0)); + if (op->use_empty()) op->erase(); return; } if (constant.hasOneUse()) { - const_op.getOperation()->moveBefore(®ion->front().front()); + op->moveBefore(®ion->front().front()); return; } - map_entry.first->getSecond() = const_op.clone(); + map_entry.first->getSecond() = op->clone(); region->front().getOperations().insert(region->front().begin(), map_entry.first->getSecond()); - use->set(map_entry.first->getSecond().getResult()); + use->set(map_entry.first->getSecond()->getResult(0)); }); } }; -static mlir::PassRegistration pass( - "mhlo-sink-constants-to-control-flow", - "Sink constants implicitly captured in control flow regions. This is " - "necessary to export to XLA."); - } // anonymous namespace +// TODO(hinsu): Rename this pass and move to a different file along with the +// generalization to make all ops isolated from above. std::unique_ptr> createSinkConstantsToControlFlowPass() { - return std::make_unique(); + return std::make_unique(); } } // namespace mhlo diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/test_infer_shaped_type_pass.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/test_infer_shaped_type_pass.cc index 184420bb8f7..35e5a184472 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/test_infer_shaped_type_pass.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/test_infer_shaped_type_pass.cc @@ -13,16 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/Identifier.h" // from @llvm-project -#include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/IR/OperationSupport.h" // from @llvm-project -#include "mlir/IR/PatternMatch.h" // from @llvm-project -#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Identifier.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/OperationSupport.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Interfaces/InferTypeOpInterface.h" +#include "mlir/Pass/Pass.h" namespace mlir { -namespace hlo { +namespace mhlo { namespace { struct InferReturnTypeComponentsPattern : public RewritePattern { @@ -92,9 +92,10 @@ struct TestInferShapedTypeMethodsPass }; } // namespace -} // namespace hlo -} // namespace mlir -static mlir::PassRegistration pass( - "mhlo-test-infer-shaped-type-methods", - "Uses test ops to invoke InferShapedTypeOpInterface methods"); +std::unique_ptr createTestInferShapedTypeMethodsPass() { + return std::make_unique(); +} + +} // namespace mhlo +} // namespace mlir diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/transform_unranked_hlo.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/transform_unranked_hlo.cc index 53947855cc7..7c985ea7535 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/transform_unranked_hlo.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/transform_unranked_hlo.cc @@ -14,24 +14,38 @@ limitations under the License. 
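The sinking logic above now accepts any constant-producing op rather than only std ConstantOp: it keys on a trait of the defining op. The trait's template argument is not visible in the hunk, so the spelling below is an assumption; a minimal sketch of the check:

    // Sketch only: assumes the trait used is mlir::OpTrait::ConstantLike.
    #include "mlir/IR/OpDefinition.h"
    #include "mlir/IR/Value.h"

    static bool IsSinkableConstant(mlir::Value value) {
      mlir::Operation *def = value.getDefiningOp();
      return def && def->hasTrait<mlir::OpTrait::ConstantLike>();
    }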
==============================================================================*/ -#include "absl/memory/memory.h" -#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project -#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project -#include "mlir/IR/Function.h" // from @llvm-project -#include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/IR/Operation.h" // from @llvm-project -#include "mlir/IR/PatternMatch.h" // from @llvm-project -#include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Transforms/DialectConversion.h" // from @llvm-project -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/transforms/rewriters.h" +#include "mlir/Dialect/Shape/IR/Shape.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/IR/Function.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/DialectConversion.h" namespace mlir { namespace mhlo { namespace { -// TODO(frgossen): Make it variadic. +// TODO(herhut): Generate these out of op definitions. +#define MAP_XLA_OPERATION_CWISE_UNARY(fn, sep) \ + fn(AbsOp) sep fn(CeilOp) sep fn(ClzOp) sep fn(CosOp) sep fn(ExpOp) \ + sep fn(Expm1Op) sep fn(FloorOp) sep fn(ImagOp) sep fn(IsFiniteOp) \ + sep fn(LogOp) sep fn(Log1pOp) sep fn(LogisticOp) sep fn(NotOp) \ + sep fn(NegOp) sep fn(PopulationCountOp) sep fn(RealOp) \ + sep fn(RoundOp) sep fn(RsqrtOp) sep fn(SignOp) sep fn(SinOp) \ + sep fn(SqrtOp) sep fn(TanhOp) + +// TODO(herhut): Generate these out of op definitions. +#define MAP_XLA_OPERATION_CWISE_BINARY(fn, sep) \ + fn(AddOp) sep fn(Atan2Op) sep fn(ComplexOp) sep fn(DivOp) sep fn(MaxOp) \ + sep fn(MinOp) sep fn(MulOp) sep fn(PowOp) sep fn(RemOp) \ + sep fn(ShiftLeftOp) sep fn(ShiftRightArithmeticOp) \ + sep fn(ShiftRightLogicalOp) sep fn(SubOp) + template inline void AddLegalOpOnRankedTensor(ConversionTarget *target) { target->addDynamicallyLegalOp([](OpTy op) { @@ -60,29 +74,24 @@ struct UnaryElementwiseOpConversion : public OpRewritePattern { // Generate IR to flatten the operand. auto loc = op.getLoc(); - Value shape = rewriter.create(loc, operand); - Value numElements = rewriter.create( - loc, rewriter.getType(), shape); - Value numElementsAsIndex = rewriter.create( - loc, rewriter.getIndexType(), numElements); - Value flatShapeAsDimTensor = - rewriter.create(loc, numElementsAsIndex); + Type extentTensorTy = shape::getExtentTensorType(rewriter.getContext()); + Value shape = + rewriter.create(loc, extentTensorTy, operand); + Type indexTy = rewriter.getIndexType(); + Value numElements = + rewriter.create(loc, indexTy, shape); + Value flatShape = rewriter.create(loc, numElements); auto flatTensorTy = RankedTensorType::get({ShapedType::kDynamicSize}, operandTy.getElementType()); Value flatOperand = rewriter.create( - loc, flatTensorTy, operand, flatShapeAsDimTensor); + loc, flatTensorTy, operand, flatShape); // Generate IR for the actual operation. Value flatResult = rewriter.create(loc, flatTensorTy, flatOperand); // Generate IR to restore the original shape. 
- auto extentTensorTy = RankedTensorType::get({ShapedType::kDynamicSize}, - rewriter.getIndexType()); - Value shapeAsExtentTensor = - rewriter.create(loc, extentTensorTy, shape); - Value result = rewriter.create( - loc, operandTy, flatResult, shapeAsExtentTensor); - rewriter.replaceOp(op, result); + rewriter.replaceOpWithNewOp(op, operandTy, + flatResult, shape); return success(); } @@ -108,17 +117,18 @@ struct BinaryElementwiseOpConversion : public OpRewritePattern { } // Flatten operands. - Type shapeTy = shape::ShapeType::get(rewriter.getContext()); auto loc = op.getLoc(); - Value shapeLhs = rewriter.create(loc, op.lhs()); - Value shapeRhs = rewriter.create(loc, op.rhs()); - Value shape = rewriter.create(loc, shapeTy, + Type extentTensorTy = shape::getExtentTensorType(rewriter.getContext()); + Value shapeLhs = + rewriter.create(loc, extentTensorTy, op.lhs()); + Value shapeRhs = + rewriter.create(loc, extentTensorTy, op.rhs()); + Value shape = rewriter.create(loc, extentTensorTy, ValueRange{shapeLhs, shapeRhs}); - Value numElements = rewriter.create(loc, shape); - Value numElementsAsIndex = - rewriter.create(loc, numElements); - Value flatShape = - rewriter.create(loc, numElementsAsIndex); + Type indexTy = rewriter.getIndexType(); + Value numElements = + rewriter.create(loc, indexTy, shape); + Value flatShape = rewriter.create(loc, numElements); TensorType lhsTy = op.lhs().getType().template cast(); Type flatLhsTy = RankedTensorType::get({ShapedType::kDynamicSize}, lhsTy.getElementType()); @@ -134,13 +144,8 @@ struct BinaryElementwiseOpConversion : public OpRewritePattern { Value flatResult = rewriter.create(loc, flatLhs, flatRhs); // Restore original shape. - auto extentTensorTy = RankedTensorType::get({ShapedType::kDynamicSize}, - rewriter.getIndexType()); - Value shapeAsExtentTensor = - rewriter.create(loc, extentTensorTy, shape); - Value result = rewriter.create( - loc, op.getType(), flatResult, shapeAsExtentTensor); - rewriter.replaceOp(op, result); + rewriter.replaceOpWithNewOp(op, op.getType(), flatResult, + shape); return success(); } @@ -155,15 +160,17 @@ struct TransformUnrankedHloPass target.addLegalDialect(); target.addLegalOp(); - AddLegalOpOnRankedTensor(&target); - AddLegalOpOnRankedTensor(&target); +#define ADD_LEGAL(op) AddLegalOpOnRankedTensor(&target) + MAP_XLA_OPERATION_CWISE_UNARY(ADD_LEGAL, ;); + MAP_XLA_OPERATION_CWISE_BINARY(ADD_LEGAL, ;); +#undef ADD_LEGAL // Populate rewrite patterns. OwningRewritePatternList patterns; PopulateTransformUnrankedHloPatterns(&ctx, &patterns); // Apply transformation. - if (failed(applyFullConversion(getFunction(), target, patterns))) + if (failed(applyPartialConversion(getFunction(), target, patterns))) return signalPassFailure(); } }; @@ -174,15 +181,22 @@ void PopulateTransformUnrankedHloPatterns(MLIRContext *context, OwningRewritePatternList *patterns) { // TODO(frgossen): Populate all unary and binary operations. 
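Both elementwise conversions above follow the same flatten-apply-restore recipe: capture the operand shape as an extent tensor, reshape to a rank-1 tensor of dynamic size, run the op on the flat tensor, and reshape the result back through the saved shape. The recipe works because an elementwise op depends only on the element count, not on the layout; as a plain-C++ analogy (illustrative only):

    // Plain-C++ analogy of flatten-apply-restore for a unary elementwise op.
    #include <cstddef>
    #include <vector>

    std::vector<float> ApplyElementwiseFlat(const std::vector<float> &flat_data,
                                            float (*op)(float)) {
      // The shape is carried separately by the caller and re-attached to the
      // result; the computation itself only sees the flat buffer.
      std::vector<float> out(flat_data.size());
      for (std::size_t i = 0; i < flat_data.size(); ++i)
        out[i] = op(flat_data[i]);
      return out;
    }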
// clang-format off +#define MAP_UNARY(op) UnaryElementwiseOpConversion +#define MAP_BINARY(op) BinaryElementwiseOpConversion +#define COMMA , patterns->insert< - BinaryElementwiseOpConversion, - UnaryElementwiseOpConversion>(context); + MAP_XLA_OPERATION_CWISE_UNARY(MAP_UNARY, COMMA), + MAP_XLA_OPERATION_CWISE_BINARY(MAP_BINARY, COMMA) + >(context); +#undef MAP_UNARY +#undef MAP_BINARY +#undef COMMA // clang-format on } -static PassRegistration transform_unranked_hlo_pass( - "transform-unranked-hlo", - "Realize element-wise operations on ranked tensors where possible"); +std::unique_ptr<::mlir::Pass> createTransformUnrankedHloPass() { + return std::make_unique(); +} } // namespace mhlo } // namespace mlir diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/unfuse_batch_norm.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/unfuse_batch_norm.cc index 09c9c61119e..1458e5f3d63 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/unfuse_batch_norm.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/unfuse_batch_norm.cc @@ -14,15 +14,15 @@ limitations under the License. ==============================================================================*/ #include "llvm/ADT/SmallVector.h" -#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project -#include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/Builders.h" // from @llvm-project -#include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/IR/PatternMatch.h" // from @llvm-project -#include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/IR/Types.h" // from @llvm-project -#include "mlir/Transforms/DialectConversion.h" // from @llvm-project -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/IR/Types.h" +#include "mlir/Transforms/DialectConversion.h" namespace mlir { namespace mhlo { diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/unfuse_batch_norm_pass.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/unfuse_batch_norm_pass.cc index c26d73f3306..f187a7470cf 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/unfuse_batch_norm_pass.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/unfuse_batch_norm_pass.cc @@ -13,14 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. 
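The MAP_XLA_OPERATION_CWISE_* macros above are X-macros: a single op list takes a per-op macro and a separator, so the same list expands once into per-op legality statements (with ';' as the separator) and once into a comma-separated template argument list for patterns->insert<...>. A reduced, self-contained illustration with stand-in op types:

    // Sketch only: AbsOp/CeilOp/SqrtOp are stand-in structs, not the real ops.
    #include <iostream>

    struct AbsOp {};
    struct CeilOp {};
    struct SqrtOp {};

    #define MY_CWISE_UNARY(fn, sep) fn(AbsOp) sep fn(CeilOp) sep fn(SqrtOp)

    template <typename OpTy>
    void MarkLegalOnRankedTensors() { /* ConversionTarget setup goes here */ }

    template <typename... Patterns>
    void InsertPatterns() {
      std::cout << sizeof...(Patterns) << " patterns inserted\n";
    }

    int main() {
      // Expansion 1: one statement per op, ';' as the separator.
    #define ADD_LEGAL(op) MarkLegalOnRankedTensors<op>()
      MY_CWISE_UNARY(ADD_LEGAL, ;);
    #undef ADD_LEGAL

      // Expansion 2: a comma-separated template argument list.
    #define MAP_UNARY(op) op
    #define COMMA ,
      InsertPatterns<MY_CWISE_UNARY(MAP_UNARY, COMMA)>();
    #undef MAP_UNARY
    #undef COMMA
      return 0;
    }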
==============================================================================*/ -#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project -#include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/IR/Operation.h" // from @llvm-project -#include "mlir/IR/PatternMatch.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Transforms/DialectConversion.h" // from @llvm-project -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/transforms/rewriters.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/DialectConversion.h" namespace mlir { namespace mhlo { @@ -38,9 +38,9 @@ struct TestUnfuseBatchNormPass } // namespace +std::unique_ptr<::mlir::Pass> createTestUnfuseBatchNormPass() { + return std::make_unique(); +} + } // namespace mhlo } // namespace mlir - -static mlir::PassRegistration pass( - "mhlo-test-unfuse-batch-norm", - "Test pass for materializing 'broadcast_dimensions' attributes"); diff --git a/tensorflow/tools/dockerfiles/tests/import-mkl-horovod.sh b/tensorflow/compiler/mlir/hlo/lib/utils/CMakeLists.txt old mode 100755 new mode 100644 similarity index 58% rename from tensorflow/tools/dockerfiles/tests/import-mkl-horovod.sh rename to tensorflow/compiler/mlir/hlo/lib/utils/CMakeLists.txt index b1cae48c6ee..17e86f1caa8 --- a/tensorflow/tools/dockerfiles/tests/import-mkl-horovod.sh +++ b/tensorflow/compiler/mlir/hlo/lib/utils/CMakeLists.txt @@ -1,18 +1,25 @@ -#!/usr/bin/env bash - -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# ============================================================================ +# -python -c 'from tensorflow.python import pywrap_tensorflow; pywrap_tensorflow.IsMklEnabled() or exit(1); import horovod.tensorflow as hvd' +add_mlir_library(MLIRMhloUtils + broadcast_utils.cc + convert_op_folder.cc + cycle_detector.cc + hlo_utils.cc + + LINK_LIBS PUBLIC + MLIRSupport + ) diff --git a/tensorflow/compiler/mlir/hlo/lib/utils/broadcast_utils.cc b/tensorflow/compiler/mlir/hlo/lib/utils/broadcast_utils.cc index e05ec3c3481..71b1a4e164f 100644 --- a/tensorflow/compiler/mlir/hlo/lib/utils/broadcast_utils.cc +++ b/tensorflow/compiler/mlir/hlo/lib/utils/broadcast_utils.cc @@ -13,15 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/broadcast_utils.h" +#include "mlir-hlo/utils/broadcast_utils.h" #include #include "llvm/ADT/Sequence.h" #include "llvm/ADT/SmallVector.h" -#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project -#include "mlir/IR/Diagnostics.h" // from @llvm-project -#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/Dialect/Shape/IR/Shape.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/IR/Diagnostics.h" +#include "mlir/IR/StandardTypes.h" namespace mlir { namespace hlo { @@ -46,9 +47,9 @@ bool IsLegalNumpyRankedBroadcast(Value lhs, Value rhs, broadcast_dims.getIntValues().begin()); } -Value ComputeBinaryElementwiseBroadcastingResultExtents(Location loc, Value lhs, - Value rhs, - OpBuilder& builder) { +Value ComputeBinaryElementwiseBroadcastingResultExtents( + Location loc, Value lhs, Value rhs, OpBuilder& builder, + bool unsafe_as_extent_tensor) { auto lhs_type = lhs.getType().dyn_cast(); auto rhs_type = rhs.getType().dyn_cast(); if (!lhs_type || !rhs_type) { @@ -57,17 +58,22 @@ Value ComputeBinaryElementwiseBroadcastingResultExtents(Location loc, Value lhs, return nullptr; } - int64_t result_rank = std::max(lhs_type.getRank(), rhs_type.getRank()); - auto shape_type = shape::ShapeType::get(builder.getContext()); - Value lhs_shape_v = - builder.createOrFold(loc, shape_type, lhs); - Value rhs_shape_v = - builder.createOrFold(loc, shape_type, rhs); - Value result_shape_v = builder.createOrFold( - loc, shape_type, lhs_shape_v, rhs_shape_v, nullptr /* error */); - return builder.createOrFold( - loc, RankedTensorType::get({result_rank}, builder.getIndexType()), - result_shape_v); + Value lhs_shape_v = builder.createOrFold(loc, lhs); + Value rhs_shape_v = builder.createOrFold(loc, rhs); + + if (unsafe_as_extent_tensor) { + int64_t result_rank = std::max(lhs_type.getRank(), rhs_type.getRank()); + Value result_shape_v = builder.createOrFold( + loc, shape::getExtentTensorType(builder.getContext()), lhs_shape_v, + rhs_shape_v, nullptr /* error */); + return builder.createOrFold( + loc, RankedTensorType::get({result_rank}, builder.getIndexType()), + result_shape_v); + } + + return builder.createOrFold( + loc, builder.getType(), lhs_shape_v, rhs_shape_v, + nullptr /* error */); } } // namespace hlo diff --git a/tensorflow/compiler/mlir/hlo/lib/utils/convert_op_folder.cc b/tensorflow/compiler/mlir/hlo/lib/utils/convert_op_folder.cc index ea074c4907d..0751d2c626c 100644 --- a/tensorflow/compiler/mlir/hlo/lib/utils/convert_op_folder.cc +++ b/tensorflow/compiler/mlir/hlo/lib/utils/convert_op_folder.cc @@ -15,11 +15,11 @@ limitations under the License. // This file defines helpers useful when creating or manipulating lhlo/hlo. 
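To make the ComputeBinaryElementwiseBroadcastingResultExtents change above concrete: with the new unsafe_as_extent_tensor flag, the helper emits one of two shape computations. A sketch of both forms, with illustrative operand types and an assumed result rank of 2 (the chlo_infer_shape_type_methods.mlir and chlo_legalize_to_hlo_broadcasts.mlir expectations later in this diff are the authoritative patterns):

func @result_extents_sketch(%lhs: tensor<?xf32>, %rhs: tensor<?x?xf32>) -> (!shape.shape, tensor<2xindex>) {
  %lhs_shape = shape.shape_of %lhs : tensor<?xf32> -> tensor<?xindex>
  %rhs_shape = shape.shape_of %rhs : tensor<?x?xf32> -> tensor<?xindex>
  // unsafe_as_extent_tensor == false: keep the error-carrying !shape.shape result.
  %safe = shape.broadcast %lhs_shape, %rhs_shape
      : tensor<?xindex>, tensor<?xindex> -> !shape.shape
  // unsafe_as_extent_tensor == true: broadcast as extent tensors and cast to the
  // statically known result rank; only valid once broadcastability has been
  // established, e.g. inside a shape.assuming region.
  %bcast = shape.broadcast %lhs_shape, %rhs_shape
      : tensor<?xindex>, tensor<?xindex> -> tensor<?xindex>
  %unsafe = tensor_cast %bcast : tensor<?xindex> to tensor<2xindex>
  return %safe, %unsafe : !shape.shape, tensor<2xindex>
}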
-#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/convert_op_folder.h" +#include "mlir-hlo/utils/convert_op_folder.h" -#include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Attributes.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/IR/TypeUtilities.h" namespace mlir { namespace hlo { diff --git a/tensorflow/compiler/mlir/hlo/lib/utils/cycle_detector.cc b/tensorflow/compiler/mlir/hlo/lib/utils/cycle_detector.cc index 6145391a608..0914460236d 100644 --- a/tensorflow/compiler/mlir/hlo/lib/utils/cycle_detector.cc +++ b/tensorflow/compiler/mlir/hlo/lib/utils/cycle_detector.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/cycle_detector.h" +#include "mlir-hlo/utils/cycle_detector.h" #include diff --git a/tensorflow/compiler/mlir/hlo/lib/utils/cycle_detector_test.cc b/tensorflow/compiler/mlir/hlo/lib/utils/cycle_detector_test.cc index 314bbd699c7..263321c17d1 100644 --- a/tensorflow/compiler/mlir/hlo/lib/utils/cycle_detector_test.cc +++ b/tensorflow/compiler/mlir/hlo/lib/utils/cycle_detector_test.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/cycle_detector.h" +#include "mlir-hlo/utils/cycle_detector.h" #include "tensorflow/compiler/xla/test.h" diff --git a/tensorflow/compiler/mlir/hlo/lib/utils/hlo_utils.cc b/tensorflow/compiler/mlir/hlo/lib/utils/hlo_utils.cc index 184d113fb9d..df2442cc4b6 100644 --- a/tensorflow/compiler/mlir/hlo/lib/utils/hlo_utils.cc +++ b/tensorflow/compiler/mlir/hlo/lib/utils/hlo_utils.cc @@ -13,11 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/hlo_utils.h" +#include "mlir-hlo/utils/hlo_utils.h" #include -#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Attributes.h" namespace mlir { namespace hlo { diff --git a/tensorflow/compiler/mlir/hlo/tests/CMakeLists.txt b/tensorflow/compiler/mlir/hlo/tests/CMakeLists.txt new file mode 100644 index 00000000000..36a7eec5a1f --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/tests/CMakeLists.txt @@ -0,0 +1,36 @@ +# +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +configure_lit_site_cfg( + ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in + ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py + MAIN_CONFIG + ${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py +) + +set(MLIR_HLO_TEST_DEPENDS + FileCheck count not + mlir-hlo-opt +) + +add_lit_testsuite(check-mlir-hlo-lit "Running the mlir-hlo regression tests" + ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS ${MLIR_HLO_TEST_DEPENDS} + ) +set_target_properties(check-mlir-hlo-lit PROPERTIES FOLDER "Tests") + +add_lit_testsuites(MLIR_HLO_OPT ${CMAKE_CURRENT_SOURCE_DIR} DEPENDS ${MLIR_HLO_TEST_DEPENDS}) + +add_dependencies(check-mlir-hlo check-mlir-hlo-lit) diff --git a/tensorflow/compiler/mlir/hlo/tests/canonicalize.mlir b/tensorflow/compiler/mlir/hlo/tests/canonicalize.mlir index 87774129ffb..15b1a150fdd 100644 --- a/tensorflow/compiler/mlir/hlo/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/canonicalize.mlir @@ -191,6 +191,20 @@ func @concatenate_const_2D_horizontal() -> tensor<2x2xi32> { return %2 : tensor<2x2xi32> } +// CHECK-LABEL: constant_like_constant +func @constant_like_constant(%arg0: tensor<3x4xi32>) -> tensor<3x4xf32> { + // CHECK: mhlo.constant dense<3.200000e+00> + %0 = "chlo.constant_like"(%arg0) { value = 3.2 : f32 } : (tensor<3x4xi32>) -> tensor<3x4xf32> + return %0 : tensor<3x4xf32> +} + +// CHECK-LABEL: constant_like_constant_dynamic +func @constant_like_constant_dynamic(%arg0: tensor<*xi32>) -> tensor<*xf32> { + // CHECK: chlo.constant_like + %0 = "chlo.constant_like"(%arg0) { value = 3.2 : f32 } : (tensor<*xi32>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + // CHECK-LABEL: dynamic_slice_variable_start func @dynamic_slice_variable_start(%arg0: tensor<3x4xi32>, %arg1: tensor, %arg2: tensor) -> tensor<1x4xi32> { // CHECK: "mhlo.dynamic-slice" @@ -365,6 +379,25 @@ func @dynamic_broadcast_in_dim_op_not_actually_dynamic(%arg0: tensor<4xf32>, %ar return %0 : tensor<5x4xf32> } +// CHECK-LABEL: func @dynamic_broadcast_in_dim_to_same_shape_1 +func @dynamic_broadcast_in_dim_to_same_shape_1(%arg0: tensor) -> tensor { + // CHECK-SAME: %[[ARG:.*]]: tensor + %0 = shape.shape_of %arg0 : tensor -> tensor<1xindex> + %2 = "mhlo.dynamic_broadcast_in_dim"(%arg0, %0) { broadcast_dimensions = dense<0> : tensor<1xi64> } : (tensor, tensor<1xindex>) -> tensor + // CHECK: return %[[ARG]] : tensor + return %2 : tensor +} + +// CHECK-LABEL: func @dynamic_broadcast_in_dim_to_same_shape_2 +func @dynamic_broadcast_in_dim_to_same_shape_2(%arg0: tensor) -> tensor { + // CHECK-SAME: %[[ARG:.*]]: tensor + %0 = shape.shape_of %arg0 : tensor -> !shape.shape + %1 = shape.to_extent_tensor %0 : !shape.shape -> tensor<1xindex> + %2 = "mhlo.dynamic_broadcast_in_dim"(%arg0, %1) { broadcast_dimensions = dense<0> : tensor<1xi64> } : (tensor, tensor<1xindex>) -> tensor + // CHECK: return %[[ARG]] : tensor + return %2 : tensor +} + // CHECK-LABEL: func @broadcast_in_dim_constant_fold_0d func @broadcast_in_dim_constant_fold_0d() -> tensor<1x64x224x224xf32> { %cst = mhlo.constant dense<0.000000e+00> : tensor @@ -542,3 +575,25 @@ func @dce_while_without_side_effect(%arg0: tensor) -> tensor { return %arg0 : tensor } + +// CHECK-LABEL: unpack_repack_same_tuple +// CHECK-SAME: ([[ARG0:%.*]]: tuple, !mhlo.token, tensor>) +func @unpack_repack_same_tuple(%arg0: tuple, !mhlo.token, tensor>) -> tuple, !mhlo.token, tensor> { + %0 = "mhlo.get_tuple_element"(%arg0) {index = 0 : i32} : (tuple, !mhlo.token, tensor>) -> tensor + %1 = "mhlo.get_tuple_element"(%arg0) {index = 1 : i32} : (tuple, !mhlo.token, tensor>) -> !mhlo.token + %2 = 
"mhlo.get_tuple_element"(%arg0) {index = 2 : i32} : (tuple, !mhlo.token, tensor>) -> tensor + %3 = "mhlo.tuple"(%0, %1, %2) : (tensor, !mhlo.token, tensor) -> tuple, !mhlo.token, tensor> + + // CHECK: return [[ARG0]] + return %3 : tuple, !mhlo.token, tensor> +} + +// CHECK-LABEL: unpack_repack_same_tuple_single_element +// CHECK-SAME: ([[ARG0:%.*]]: tuple>) +func @unpack_repack_same_tuple_single_element(%arg0: tuple>) -> tuple> { + %0 = "mhlo.get_tuple_element"(%arg0) {index = 0 : i32} : (tuple>) -> tensor + %3 = "mhlo.tuple"(%0) : (tensor) -> tuple> + + // CHECK: return [[ARG0]] + return %3 : tuple> +} diff --git a/tensorflow/compiler/mlir/hlo/tests/chlo_infer_shape_type_methods.mlir b/tensorflow/compiler/mlir/hlo/tests/chlo_infer_shape_type_methods.mlir index 65074325563..d226c92858a 100644 --- a/tensorflow/compiler/mlir/hlo/tests/chlo_infer_shape_type_methods.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/chlo_infer_shape_type_methods.mlir @@ -5,15 +5,14 @@ // only test reification on an examplar op. // CHECK-SAME: %[[ARG0:.+]]: tensor, // CHECK-SAME: %[[ARG1:.+]]: tensor -func @broadcast_add(%arg0: tensor, %arg1: tensor) -> tensor<1xindex> { +func @broadcast_add(%arg0: tensor, %arg1: tensor) -> !shape.shape { // CHECK-DAG: %[[ARG0_S:.+]] = shape.shape_of %[[ARG0]] // CHECK-DAG: %[[ARG1_S:.+]] = shape.shape_of %[[ARG1]] - // CHECK-DAG: %[[BCAST_S:.+]] = "shape.broadcast"(%[[ARG0_S]], %[[ARG1_S]]) - // CHECK: %[[EXTENTS:.+]] = shape.to_extent_tensor %[[BCAST_S]] - // CHECK: return %[[EXTENTS]] + // CHECK-DAG: %[[BCAST_S:.+]] = shape.broadcast %[[ARG0_S]], %[[ARG1_S]] : tensor, tensor -> !shape.shape + // CHECK: return %[[BCAST_S]] : !shape.shape %0 = chlo.broadcast_add %arg0, %arg1 : (tensor, tensor) -> tensor - %1 = "mhlo_test.reify_return_type_shapes"(%0) : (tensor) -> tensor<1xindex> - return %1 : tensor<1xindex> + %1 = "mhlo_test.reify_return_type_shapes"(%0) : (tensor) -> !shape.shape + return %1 : !shape.shape } // ----- diff --git a/tensorflow/compiler/mlir/hlo/tests/chlo_legalize_to_hlo_broadcasts.mlir b/tensorflow/compiler/mlir/hlo/tests/chlo_legalize_to_hlo_broadcasts.mlir index 2c0e2d7f170..9670372a864 100644 --- a/tensorflow/compiler/mlir/hlo/tests/chlo_legalize_to_hlo_broadcasts.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/chlo_legalize_to_hlo_broadcasts.mlir @@ -18,8 +18,8 @@ func @dynamicBroadcast(%arg0: tensor, %arg1: tensor) -> tensor to tensor<2xindex> // CHECK-DAG: %[[ARG0_B:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[ARG0]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} // CHECK-DAG: %[[ARG1_B:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[ARG1]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} // CHECK-NEXT: %[[RESULT:.+]] = mhlo.add %[[ARG0_B]], %[[ARG1_B]] @@ -39,8 +39,8 @@ func @dynamicBroadcastComplex(%arg0: tensor, %arg1: tensor) -> t // CHECK-DAG: %[[ARG1_S:.+]] = shape.shape_of %[[ARG1]] // CHECK-NEXT: %[[WITNESS:.+]] = shape.cstr_broadcastable %[[ARG0_S]], %[[ARG1_S]] // CHECK-NEXT: %[[FINAL_RESULT:.+]] = shape.assuming %[[WITNESS]] - // CHECK-NEXT: %[[RESULT_S:.+]] = "shape.broadcast"(%[[ARG0_S]], %[[ARG1_S]]) - // CHECK-NEXT: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_S]] + // CHECK-NEXT: %[[RESULT_S:.+]] = shape.broadcast %[[ARG0_S]], %[[ARG1_S]] + // CHECK-NEXT: %[[RESULT_EXTENTS:.+]] = tensor_cast %[[RESULT_S]] : tensor to tensor<2xindex> // CHECK-DAG: %[[ARG0_B:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[ARG0]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, 
tensor<2xindex>) -> tensor // CHECK-DAG: %[[ARG1_B:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[ARG1]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor, tensor<2xindex>) -> tensor // CHECK-NEXT: %[[RESULT:.+]] = "mhlo.complex"(%[[ARG0_B]], %[[ARG1_B]]) : (tensor, tensor) -> tensor> @@ -60,8 +60,8 @@ func @dynamicBroadcastCompare(%arg0: tensor, %arg1: tensor) -> t // CHECK-DAG: %[[ARG1_S:.+]] = shape.shape_of %[[ARG1]] // CHECK: %[[WITNESS:.+]] = shape.cstr_broadcastable %[[ARG0_S]], %[[ARG1_S]] // CHECK: %[[FINAL_RESULT:.+]] = shape.assuming %[[WITNESS]] - // CHECK: %[[RESULT_S:.+]] = "shape.broadcast"(%[[ARG0_S]], %[[ARG1_S]]) - // CHECK: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_S]] + // CHECK: %[[RESULT_S:.+]] = shape.broadcast %[[ARG0_S]], %[[ARG1_S]] + // CHECK: %[[RESULT_EXTENTS:.+]] = tensor_cast %[[RESULT_S]] : tensor to tensor<2xindex> // CHECK-DAG: %[[ARG0_B:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[ARG0]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<2xindex>) -> tensor // CHECK-DAG: %[[ARG1_B:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[ARG1]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor, tensor<2xindex>) -> tensor // CHECK: %[[RESULT:.+]] = "mhlo.compare"(%[[ARG0_B]], %[[ARG1_B]]) {comparison_direction = "EQ"} : (tensor, tensor) -> tensor @@ -237,3 +237,199 @@ func @xorWithoutBroadcast(%arg0: tensor<4xi1>, %arg1: tensor<4xi1>) -> tensor<4x %0 = chlo.broadcast_xor %arg0, %arg1 : (tensor<4xi1>, tensor<4xi1>) -> tensor<4xi1> return %0 : tensor<4xi1> } + +// ----- +func @addScalarUnranked(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<*xf32> { + %0 = chlo.broadcast_add %arg0, %arg1 : (tensor, tensor<*xf32>) + -> tensor<*xf32> + return %0 : tensor<*xf32> +} + +// CHECK-LABEL: func @addScalarUnranked( +// CHECK-SAME: %[[ARG_0:.*]]: tensor, +// CHECK-SAME: %[[ARG_1:.*]]: tensor<*xf32> +// CHECK-SAME: ) -> tensor<*xf32> { +// First handle the dynamic reshaping of the unranked operand +// to a 1D tensor. +// CHECK: %[[SHAPE_1:.*]] = shape.shape_of %[[ARG_1]] : tensor<*xf32> +// CHECK: %[[NUM_ELEMENTS:.*]] = shape.num_elements %[[SHAPE_1]] : tensor -> index +// CHECK: %[[SIZE_TENSOR:.*]] = tensor_from_elements(%[[NUM_ELEMENTS]]) : tensor<1xindex> +// CHECK: %[[RESHAPED:.*]] = "mhlo.dynamic_reshape"(%[[ARG_1]], %[[SIZE_TENSOR]]) : (tensor<*xf32>, tensor<1xindex>) -> tensor +// The assuming region is part of the second stage of lowering +// with ranked broadcasting logic. 
+// CHECK: %[[SHAPE_0:.*]] = shape.shape_of %[[ARG_0]] : tensor +// CHECK: %[[SHAPE_RESHAPED:.*]] = shape.shape_of %[[RESHAPED]] : tensor +// CHECK: %[[WITNESS:.*]] = shape.cstr_broadcastable %[[SHAPE_0]], %[[SHAPE_RESHAPED]] +// CHECK: %[[ASSUMING_RESULT:.*]] = shape.assuming %[[WITNESS]] -> (tensor) { +// CHECK: %[[SCALAR_SHAPE:.*]] = shape.const_shape [] +// CHECK: %[[BROADCASTED_SHAPE:.*]] = shape.broadcast %[[SCALAR_SHAPE]], %[[SHAPE_RESHAPED]] +// CHECK: %[[SHAPE_TENSOR:.*]] = tensor_cast %[[BROADCASTED_SHAPE]] : tensor to tensor<1xindex> +// CHECK: %[[BROADCASTED_LHS:.*]] = "mhlo.dynamic_broadcast_in_dim"(%[[ARG_0]], %[[SHAPE_TENSOR]]) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor, tensor<1xindex>) -> tensor +// CHECK: %[[BROADCASTED_RHS:.*]] = "mhlo.dynamic_broadcast_in_dim"(%[[RESHAPED]], %[[SHAPE_TENSOR]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor, tensor<1xindex>) -> tensor +// CHECK: %[[BROADCASTED_RESULT:.*]] = mhlo.add %[[BROADCASTED_LHS]], %[[BROADCASTED_RHS]] : tensor +// CHECK: shape.assuming_yield %[[BROADCASTED_RESULT]] : tensor +// CHECK: } +// As part of the unranked logic, the result is reshaped back +// to an unranked tensor. +// CHECK: %[[RESHAPED_RESULT:.*]] = "mhlo.dynamic_reshape"(%[[ASSUMING_RESULT:.*]], %[[SHAPE_1]]) : (tensor, tensor) -> tensor<*xf32> +// CHECK: return %[[RESHAPED_RESULT]] : tensor<*xf32> +// CHECK: } + +// ----- +func @addUnrankedScalar(%arg0: tensor<*xf32>, %arg1: tensor) -> tensor<*xf32> { + %0 = chlo.broadcast_add %arg0, %arg1 : (tensor<*xf32>, tensor) + -> tensor<*xf32> + return %0 : tensor<*xf32> +} +// CHECK-LABEL: func @addUnrankedScalar( +// CHECK-SAME: %[[ARG_0:.*]]: tensor<*xf32>, +// CHECK-SAME: %[[ARG_1:.*]]: tensor) -> tensor<*xf32> { +// First handle the dynamic reshaping of the unranked operand +// to a 1D tensor. +// CHECK: %[[SHAPE_0:.*]] = shape.shape_of %[[ARG_0]] : tensor<*xf32> +// CHECK: %[[NUM_ELEMENTS:.*]] = shape.num_elements %[[SHAPE_0]] : tensor -> index +// CHECK: %[[SIZE_TENSOR:.*]] = tensor_from_elements(%[[NUM_ELEMENTS]]) : tensor<1xindex> +// CHECK: %[[RESHAPED:.*]] = "mhlo.dynamic_reshape"(%[[ARG_0]], %[[SIZE_TENSOR]]) : (tensor<*xf32>, tensor<1xindex>) -> tensor +// The assuming region is part of the second stage of lowering +// with ranked broadcasting logic. +// CHECK: %[[SHAPE_RESHAPED:.*]] = shape.shape_of %[[RESHAPED]] : tensor +// CHECK: %[[SHAPE_1:.*]] = shape.shape_of %[[ARG_1]] : tensor +// CHECK: %[[WITNESS:.*]] = shape.cstr_broadcastable %[[SHAPE_RESHAPED]], %[[SHAPE_1]] +// CHECK: %[[ASSUMING_RESULT:.*]] = shape.assuming %[[WITNESS]] -> (tensor) { +// CHECK: %[[ASTENSOR:.*]] = tensor_cast %[[SHAPE_RESHAPED]] +// CHECK: %[[BROADCASTED_LHS:.*]] = "mhlo.dynamic_broadcast_in_dim"(%[[RESHAPED]], %[[ASTENSOR]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor, tensor<1xindex>) -> tensor +// CHECK: %[[BROADCASTED_RHS:.*]] = "mhlo.dynamic_broadcast_in_dim"(%[[ARG_1]], %[[ASTENSOR]]) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor, tensor<1xindex>) -> tensor +// CHECK: %[[BROADCASTED_RESULT:.*]] = mhlo.add %[[BROADCASTED_LHS]], %[[BROADCASTED_RHS]] : tensor +// CHECK: shape.assuming_yield %[[BROADCASTED_RESULT]] : tensor +// CHECK: } +// As part of the unranked logic, the result is reshaped back +// to an unranked tensor. 
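The @addUnrankedUnranked expectations that follow exercise the rank-specialized fallback of this lowering: compute both shapes, take the larger rank, and for each rank up to 6 broadcast each shape against an all-ones shape of that rank, reshape both operands to that rank, and apply the ranked chlo.broadcast_add. A condensed sketch of the shared prologue plus the rank-2 branch; the scalar and equal-shape early-outs are omitted, and the types, the const_shape annotation, and the assert message are illustrative assumptions (the full CHECK sequence below is authoritative):

func @rank_specialization_sketch(%lhs: tensor<*xf32>, %rhs: tensor<*xf32>) -> tensor<*xf32> {
  %lhs_shape = shape.shape_of %lhs : tensor<*xf32> -> tensor<?xindex>
  %rhs_shape = shape.shape_of %rhs : tensor<*xf32> -> tensor<?xindex>
  %lhs_rank = rank %lhs_shape : tensor<?xindex>
  %rhs_rank = rank %rhs_shape : tensor<?xindex>
  %lhs_greater = cmpi "sgt", %lhs_rank, %rhs_rank : index
  %greatest_rank = select %lhs_greater, %lhs_rank, %rhs_rank : index
  %c2 = constant 2 : index
  %is_rank_2 = cmpi "eq", %greatest_rank, %c2 : index
  %result = scf.if %is_rank_2 -> (tensor<*xf32>) {
    // Pad both shapes to rank 2 with ones, reshape, and reuse the ranked lowering.
    %ones = shape.const_shape [1, 1] : tensor<2xindex>
    %lhs_shape_2 = shape.broadcast %lhs_shape, %ones : tensor<?xindex>, tensor<2xindex> -> tensor<2xindex>
    %rhs_shape_2 = shape.broadcast %rhs_shape, %ones : tensor<?xindex>, tensor<2xindex> -> tensor<2xindex>
    %lhs_2 = "mhlo.dynamic_reshape"(%lhs, %lhs_shape_2) : (tensor<*xf32>, tensor<2xindex>) -> tensor<?x?xf32>
    %rhs_2 = "mhlo.dynamic_reshape"(%rhs, %rhs_shape_2) : (tensor<*xf32>, tensor<2xindex>) -> tensor<?x?xf32>
    %sum_2 = chlo.broadcast_add %lhs_2, %rhs_2 : (tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
    %cast_2 = tensor_cast %sum_2 : tensor<?x?xf32> to tensor<*xf32>
    scf.yield %cast_2 : tensor<*xf32>
  } else {
    // Ranks 3 through 6 repeat the same pattern; the final branch asserts.
    %false = constant false
    assert %false, "unsupported rank"
    scf.yield %lhs : tensor<*xf32>
  }
  return %result : tensor<*xf32>
}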
+// CHECK: %[[RESHAPED_RESULT:.*]] = "mhlo.dynamic_reshape"(%[[ASSUMING_RESULT:.*]], %[[SHAPE_0]]) : (tensor, tensor) -> tensor<*xf32> +// CHECK: return %[[RESHAPED_RESULT]] : tensor<*xf32> +// CHECK: } + +// ----- +func @addUnrankedUnranked( + %arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> { + %0 = chlo.broadcast_add %arg0, %arg1 : (tensor<*xf32>, tensor<*xf32>) + -> tensor<*xf32> + return %0 : tensor<*xf32> +} + +// CHECK-LABEL: func @addUnrankedUnranked( +// CHECK-SAME: %[[LHS:.*]]: tensor<*xf32>, +// CHECK-SAME: %[[RHS:.*]]: tensor<*xf32>) -> tensor<*xf32> { +// CHECK: %[[LHS_SHAPE:.*]] = shape.shape_of %[[LHS]] : tensor<*xf32> -> tensor +// CHECK: %[[RANK_LHS:.*]] = shape.rank %[[LHS_SHAPE]] : tensor -> index +// CHECK: %[[C0:.*]] = constant 0 : index +// CHECK: %[[LHS_IS_SCALAR:.*]] = cmpi "eq", %[[RANK_LHS]], %[[C0]] : index +// Handle scalar LHS case +// CHECK: %[[VAL_8:.*]] = scf.if %[[LHS_IS_SCALAR]] -> (tensor<*xf32>) { +// CHECK: %[[SCALAR_LHS:.*]] = "mhlo.reshape"(%[[LHS]]) : (tensor<*xf32>) -> tensor +// CHECK: %[[VAL_10:.*]] = chlo.broadcast_add %[[SCALAR_LHS]], %[[RHS]] : (tensor, tensor<*xf32>) -> tensor<*xf32> +// CHECK: scf.yield %[[VAL_10]] : tensor<*xf32> +// CHECK: } else { +// CHECK: %[[RHS_SHAPE:.*]] = shape.shape_of %[[RHS]] : tensor<*xf32> -> tensor +// CHECK: %[[RANK_RHS:.*]] = shape.rank %[[RHS_SHAPE]] : tensor -> index +// CHECK: %[[RHS_IS_SCALAR:.*]] = cmpi "eq", %[[RANK_RHS]], %[[C0]] : index + // Handle scalar RHS case +// CHECK: %[[VAL_14:.*]] = scf.if %[[RHS_IS_SCALAR]] -> (tensor<*xf32>) { +// CHECK: %[[SCALAR_RHS:.*]] = "mhlo.reshape"(%[[RHS]]) : (tensor<*xf32>) -> tensor +// CHECK: %[[VAL_16:.*]] = chlo.broadcast_add %[[LHS]], %[[SCALAR_RHS]] : (tensor<*xf32>, tensor) -> tensor<*xf32> +// CHECK: scf.yield %[[VAL_16]] : tensor<*xf32> +// CHECK: } else { +// CHECK: %[[SHAPES_EQ:.*]] = shape.shape_eq %[[LHS_SHAPE]], %[[RHS_SHAPE]] : tensor, tensor + // Handle scalar RHS case +// CHECK: %[[VAL_18:.*]] = scf.if %[[SHAPES_EQ]] -> (tensor<*xf32>) { +// CHECK: %[[VAL_19:.*]] = mhlo.add %[[LHS]], %[[RHS]] : tensor<*xf32> +// CHECK: scf.yield %[[VAL_19]] : tensor<*xf32> +// CHECK: } else { +// CHECK: %[[LHS_RANK:.*]] = rank %[[LHS_SHAPE]] : tensor +// CHECK: %[[RHS_RANK:.*]] = rank %[[RHS_SHAPE]] : tensor +// CHECK: %[[LHS_RANK_GREATER:.*]] = cmpi "sgt", %[[LHS_RANK]], %[[RHS_RANK]] : index +// CHECK: %[[GREATEST_RANK:.*]] = select %[[LHS_RANK_GREATER]], %[[LHS_RANK]], %[[RHS_RANK]] : index +// CHECK: %[[C2:.*]] = constant 2 : index +// CHECK: %[[GREATEST_RANK_IS_2:.*]] = cmpi "eq", %[[GREATEST_RANK]], %[[C2]] : index +// Handle rank 2 specialization +// CHECK: %[[VAL_26:.*]] = scf.if %[[GREATEST_RANK_IS_2]] -> (tensor<*xf32>) { +// CHECK: %[[CONST_SHAPE_2:.*]] = shape.const_shape [1, 1] +// CHECK: %[[BROADCASTED_LHS_2:.*]] = shape.broadcast %[[LHS_SHAPE]], %[[CONST_SHAPE_2]] : tensor, tensor<2xindex> -> tensor<2xindex> +// CHECK: %[[BROADCASTED_RHS_2:.*]] = shape.broadcast %[[RHS_SHAPE]], %[[CONST_SHAPE_2]] : tensor, tensor<2xindex> -> tensor<2xindex> +// CHECK: %[[RESHAPED_LHS_2:.*]] = "mhlo.dynamic_reshape"(%[[LHS]], %[[BROADCASTED_LHS_2]]) : (tensor<*xf32>, tensor<2xindex>) -> tensor +// CHECK: %[[RESHAPED_RHS_2:.*]] = "mhlo.dynamic_reshape"(%[[RHS]], %[[BROADCASTED_RHS_2]]) : (tensor<*xf32>, tensor<2xindex>) -> tensor +// CHECK: %[[RESULT_RANK_2:.*]] = chlo.broadcast_add %[[RESHAPED_LHS_2]], %[[RESHAPED_RHS_2]] : (tensor, tensor) -> tensor +// CHECK: %[[RESULT_2:.*]] = tensor_cast %[[RESULT_RANK_2]] : tensor to tensor<*xf32> +// CHECK: 
scf.yield %[[RESULT_2]] : tensor<*xf32> +// CHECK: } else { +// CHECK: %[[C3:.*]] = constant 3 : index +// CHECK: %[[GREATEST_RANK_IS_3:.*]] = cmpi "eq", %[[GREATEST_RANK]], %[[C3]] : index +// Handle rank 3 specialization +// CHECK: %[[VAL_34:.*]] = scf.if %[[GREATEST_RANK_IS_3]] -> (tensor<*xf32>) { +// CHECK: %[[CONST_SHAPE_3:.*]] = shape.const_shape [1, 1, 1] +// CHECK: %[[BROADCASTED_LHS_3:.*]] = shape.broadcast %[[LHS_SHAPE]], %[[CONST_SHAPE_3]] : tensor, tensor<3xindex> -> tensor<3xindex> +// CHECK: %[[BROADCASTED_RHS_3:.*]] = shape.broadcast %[[RHS_SHAPE]], %[[CONST_SHAPE_3]] : tensor, tensor<3xindex> -> tensor<3xindex> +// CHECK: %[[RESHAPED_LHS_3:.*]] = "mhlo.dynamic_reshape"(%[[LHS]], %[[BROADCASTED_LHS_3]]) : (tensor<*xf32>, tensor<3xindex>) -> tensor +// CHECK: %[[RESHAPED_RHS_3:.*]] = "mhlo.dynamic_reshape"(%[[RHS]], %[[BROADCASTED_RHS_3]]) : (tensor<*xf32>, tensor<3xindex>) -> tensor +// CHECK: %[[RESULT_RANK_3:.*]] = chlo.broadcast_add %[[RESHAPED_LHS_3]], %[[RESHAPED_RHS_3]] : (tensor, tensor) -> tensor +// CHECK: %[[RESULT_3:.*]] = tensor_cast %[[RESULT_RANK_3]] : tensor to tensor<*xf32> +// CHECK: scf.yield %[[RESULT_3]] : tensor<*xf32> +// CHECK: } else { +// CHECK: %[[C4:.*]] = constant 4 : index +// CHECK: %[[GREATEST_RANK_IS_4:.*]] = cmpi "eq", %[[GREATEST_RANK]], %[[C4]] : index +// Handle rank 4 specialization +// CHECK: %[[VAL_42:.*]] = scf.if %[[GREATEST_RANK_IS_4]] -> (tensor<*xf32>) { +// CHECK: %[[CONST_SHAPE_4:.*]] = shape.const_shape [1, 1, 1, 1] +// CHECK: %[[BROADCASTED_LHS_4:.*]] = shape.broadcast %[[LHS_SHAPE]], %[[CONST_SHAPE_4]] : tensor, tensor<4xindex> -> tensor<4xindex> +// CHECK: %[[BROADCASTED_RHS_4:.*]] = shape.broadcast %[[RHS_SHAPE]], %[[CONST_SHAPE_4]] : tensor, tensor<4xindex> -> tensor<4xindex> +// CHECK: %[[RESHAPED_LHS_4:.*]] = "mhlo.dynamic_reshape"(%[[LHS]], %[[BROADCASTED_LHS_4]]) : (tensor<*xf32>, tensor<4xindex>) -> tensor +// CHECK: %[[RESHAPED_RHS_4:.*]] = "mhlo.dynamic_reshape"(%[[RHS]], %[[BROADCASTED_RHS_4]]) : (tensor<*xf32>, tensor<4xindex>) -> tensor +// CHECK: %[[RESULT_RANK_4:.*]] = chlo.broadcast_add %[[RESHAPED_LHS_4]], %[[RESHAPED_RHS_4]] : (tensor, tensor) -> tensor +// CHECK: %[[RESULT_4:.*]] = tensor_cast %[[RESULT_RANK_4]] : tensor to tensor<*xf32> +// CHECK: scf.yield %[[RESULT_4]] : tensor<*xf32> +// CHECK: } else { +// CHECK: %[[C5:.*]] = constant 5 : index +// CHECK: %[[GREATEST_RANK_IS_5:.*]] = cmpi "eq", %[[GREATEST_RANK]], %[[C5]] : index +// Handle rank 5 specialization +// CHECK: %[[VAL_50:.*]] = scf.if %[[GREATEST_RANK_IS_5]] -> (tensor<*xf32>) { +// CHECK: %[[CONST_SHAPE_5:.*]] = shape.const_shape [1, 1, 1, 1, 1] +// CHECK: %[[BROADCASTED_LHS_5:.*]] = shape.broadcast %[[LHS_SHAPE]], %[[CONST_SHAPE_5]] : tensor, tensor<5xindex> -> tensor<5xindex> +// CHECK: %[[BROADCASTED_RHS_5:.*]] = shape.broadcast %[[RHS_SHAPE]], %[[CONST_SHAPE_5]] : tensor, tensor<5xindex> -> tensor<5xindex> +// CHECK: %[[RESHAPED_LHS_5:.*]] = "mhlo.dynamic_reshape"(%[[LHS]], %[[BROADCASTED_LHS_5]]) : (tensor<*xf32>, tensor<5xindex>) -> tensor +// CHECK: %[[RESHAPED_RHS_5:.*]] = "mhlo.dynamic_reshape"(%[[RHS]], %[[BROADCASTED_RHS_5]]) : (tensor<*xf32>, tensor<5xindex>) -> tensor +// CHECK: %[[RESULT_RANK_5:.*]] = chlo.broadcast_add %[[RESHAPED_LHS_5]], %[[RESHAPED_RHS_5]] : (tensor, tensor) -> tensor +// CHECK: %[[RESULT_5:.*]] = tensor_cast %[[RESULT_RANK_5]] : tensor to tensor<*xf32> +// CHECK: scf.yield %[[RESULT_5]] : tensor<*xf32> +// CHECK: } else { +// CHECK: %[[C6:.*]] = constant 6 : index +// CHECK: %[[GREATEST_RANK_IS_6:.*]] = 
cmpi "eq", %[[GREATEST_RANK]], %[[C6]] : index +// Handle rank 6 specialization +// CHECK: %[[VAL_58:.*]] = scf.if %[[GREATEST_RANK_IS_6]] -> (tensor<*xf32>) { +// CHECK: %[[CONST_SHAPE_6:.*]] = shape.const_shape [1, 1, 1, 1, 1, 1] +// CHECK: %[[BROADCASTED_LHS_6:.*]] = shape.broadcast %[[LHS_SHAPE]], %[[CONST_SHAPE_6]] : tensor, tensor<6xindex> -> tensor<6xindex> +// CHECK: %[[BROADCASTED_RHS_6:.*]] = shape.broadcast %[[RHS_SHAPE]], %[[CONST_SHAPE_6]] : tensor, tensor<6xindex> -> tensor<6xindex> +// CHECK: %[[RESHAPED_LHS_6:.*]] = "mhlo.dynamic_reshape"(%[[LHS]], %[[BROADCASTED_LHS_6]]) : (tensor<*xf32>, tensor<6xindex>) -> tensor +// CHECK: %[[RESHAPED_RHS_6:.*]] = "mhlo.dynamic_reshape"(%[[RHS]], %[[BROADCASTED_RHS_6]]) : (tensor<*xf32>, tensor<6xindex>) -> tensor +// CHECK: %[[RESULT_RANK_6:.*]] = chlo.broadcast_add %[[RESHAPED_LHS_6]], %[[RESHAPED_RHS_6]] : (tensor, tensor) -> tensor +// CHECK: %[[RESULT_6:.*]] = tensor_cast %[[RESULT_RANK_6]] : tensor to tensor<*xf32> +// CHECK: scf.yield %[[RESULT_6]] : tensor<*xf32> +// CHECK: } else { +// CHECK: %false = constant false +// CHECK: assert %false +// CHECK: scf.yield %[[LHS]] : tensor<*xf32> +// CHECK: } +// CHECK: scf.yield %[[VAL_64:.*]] : tensor<*xf32> +// CHECK: } +// CHECK: scf.yield %[[VAL_65:.*]] : tensor<*xf32> +// CHECK: } +// CHECK: scf.yield %[[VAL_66:.*]] : tensor<*xf32> +// CHECK: } +// CHECK: scf.yield %[[VAL_67:.*]] : tensor<*xf32> +// CHECK: } +// CHECK: scf.yield %[[VAL_68:.*]] : tensor<*xf32> +// CHECK: } +// CHECK: scf.yield %[[VAL_69:.*]] : tensor<*xf32> +// CHECK: } +// CHECK: scf.yield %[[VAL_70:.*]] : tensor<*xf32> +// CHECK: } +// CHECK: return %[[VAL_71:.*]] : tensor<*xf32> +// CHECK: } diff --git a/tensorflow/compiler/mlir/hlo/tests/hlo-legalize-gather-to-torch-index-select.mlir b/tensorflow/compiler/mlir/hlo/tests/hlo-legalize-gather-to-torch-index-select.mlir new file mode 100644 index 00000000000..ca90a80aa6c --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/tests/hlo-legalize-gather-to-torch-index-select.mlir @@ -0,0 +1,41 @@ +// RUN: mlir-hlo-opt -mhlo-legalize-gather-to-torch-index-select %s -o - | FileCheck %s + +// CHECK-LABEL: @gather_to_index_select +func @gather_to_index_select(%arg0 : tensor<5x4xf32>, %arg1 : tensor<1x3x1xi32>) -> tensor<1x3x4xf32> { + // CHECK: [[TIS:%.+]] = "mhlo.torch_index_select"(%arg0, %arg1) { + // CHECK-SAME: batch_dims = 0 : i64, + // CHECK-SAME: dim = 0 : i64 + // CHECK-SAME: } : (tensor<5x4xf32>, tensor<1x3x1xi32>) -> tensor<1x3x1x4xf32> + // CHECK: [[RES:%.+]] = "mhlo.reshape"([[TIS]]) + %0 = "mhlo.gather"(%arg0, %arg1) {dimension_numbers = {collapsed_slice_dims = dense<0> : tensor<1xi64>, index_vector_dim = 2 : i64, offset_dims = dense<2> : tensor<1xi64>, start_index_map = dense<0> : tensor<1xi64>}, indices_are_sorted = false, slice_sizes = dense<[1, 4]> : tensor<2xi64>} : (tensor<5x4xf32>, tensor<1x3x1xi32>) -> tensor<1x3x4xf32> + + // CHECK: return [[RES]] + return %0 : tensor<1x3x4xf32> +} + +// CHECK-LABEL: @scalar_gather_to_index_select +func @scalar_gather_to_index_select(%arg0 : tensor<5x4xf32>, %arg1 : tensor) -> tensor<1x4xf32> { + // CHECK: [[TIS:%.+]] = "mhlo.torch_index_select"(%arg0, %arg1) { + // CHECK-SAME: batch_dims = 0 : i64, + // CHECK-SAME: dim = 0 : i64 + // CHECK-SAME: } : (tensor<5x4xf32>, tensor) -> tensor<4xf32> + // CHECK: [[RES:%.+]] = "mhlo.reshape"([[TIS]]) + %0 = "mhlo.gather"(%arg0, %arg1) {dimension_numbers = {collapsed_slice_dims = dense<0> : tensor<1xi64>, index_vector_dim = 0 : i64, offset_dims = dense<[0, 1]> : tensor<2xi64>, 
start_index_map = dense<0> : tensor<1xi64>}, indices_are_sorted = false, slice_sizes = dense<[1, 4]> : tensor<2xi64>} : (tensor<5x4xf32>, tensor) -> tensor<1x4xf32> + + // CHECK: return [[RES]] + return %0 : tensor<1x4xf32> +} + +// CHECK-LABEL: @gather_no_lowering_subslice +func @gather_no_lowering_subslice(%arg0 : tensor<5x4xf32>, %arg1 : tensor<1x3x1xi32>) -> tensor<1x3x3xf32> { + // CHECK: "mhlo.gather" + %0 = "mhlo.gather"(%arg0, %arg1) {dimension_numbers = {collapsed_slice_dims = dense<0> : tensor<1xi64>, index_vector_dim = 2 : i64, offset_dims = dense<2> : tensor<1xi64>, start_index_map = dense<0> : tensor<1xi64>}, indices_are_sorted = false, slice_sizes = dense<[1, 3]> : tensor<2xi64>} : (tensor<5x4xf32>, tensor<1x3x1xi32>) -> tensor<1x3x3xf32> + return %0 : tensor<1x3x3xf32> +} + +// CHECK-LABEL: @gather_no_lowering_multidim +func @gather_no_lowering_multidim(%arg0 : tensor<5x4xf32>, %arg1 : tensor<1x3x2xi32>) -> tensor<1x3x4xf32> { + // CHECK: "mhlo.gather" + %0 = "mhlo.gather"(%arg0, %arg1) {dimension_numbers = {collapsed_slice_dims = dense<0> : tensor<1xi64>, index_vector_dim = 2 : i64, offset_dims = dense<2> : tensor<1xi64>, start_index_map = dense<0> : tensor<1xi64>}, indices_are_sorted = false, slice_sizes = dense<[1, 4]> : tensor<2xi64>} : (tensor<5x4xf32>, tensor<1x3x2xi32>) -> tensor<1x3x4xf32> + return %0 : tensor<1x3x4xf32> +} diff --git a/tensorflow/compiler/mlir/hlo/tests/hlo-legalize-to-lhlo.mlir b/tensorflow/compiler/mlir/hlo/tests/hlo-legalize-to-lhlo.mlir index aa5d800b82b..018711e33cb 100644 --- a/tensorflow/compiler/mlir/hlo/tests/hlo-legalize-to-lhlo.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/hlo-legalize-to-lhlo.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-hlo-opt -hlo-legalize-to-lhlo -buffer-placement -split-input-file %s -o - | FileCheck --check-prefixes=PRE,BOTH %s -// RUN: mlir-hlo-opt -hlo-legalize-to-lhlo=results-escape-function=true -buffer-placement -split-input-file %s -o - | FileCheck --check-prefixes=ESC,BOTH %s +// RUN: mlir-hlo-opt -hlo-legalize-to-lhlo -buffer-placement -split-input-file %s -o - | FILECHECK_OPTS="" FileCheck --check-prefixes=PRE,BOTH %s +// RUN: mlir-hlo-opt -hlo-legalize-to-lhlo=results-escape-function=true -buffer-placement -split-input-file %s -o - | FILECHECK_OPTS="" FileCheck --check-prefixes=ESC,BOTH %s // BOTH-LABEL: func @attrs func @attrs_copy(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { @@ -487,3 +487,26 @@ func @conv(%input: tensor<3x5x5x3xf32>, %filter : tensor<2x2x3x4xf32>) -> tensor } : (tensor<2x2x3x4xf32>, tensor<3x5x5x3xf32>) -> tensor<3x5x5x4xf32> return %out : tensor<3x5x5x4xf32> } + +// ----- + +// BOTH-LABEL: func @reduce +func @reduce(%arg0: tensor<1x8xf32>, %arg1: tensor) -> tensor<1xf32> { + // BOTH: %[[OUT:.*]] = alloc() : memref<1xf32> + // BOTH: "lmhlo.reduce"(%{{.+}}, %{{.+}}, %[[OUT]]) ( { + // BOTH: ^bb0(%[[ARG1:.*]]: memref, %[[ARG2:.*]]: memref, + // BOTH-SAME: %[[ARG3:.*]]: memref): + // BOTH: %[[TMP:.*]] = alloc() : memref + // BOTH: "lmhlo.add"(%[[ARG1]], %[[ARG2]], %[[TMP]]) + // BOTH: "lmhlo.copy"(%[[TMP]], %[[ARG3]]) + // BOTH: "lmhlo.terminator"() : () -> () + // BOTH: }) {dimensions = dense<1> : tensor<1xi64>} + // BOTH-SAME: : (memref<1x8xf32>, memref, memref<1xf32>) -> () + %0 = "mhlo.reduce"(%arg0, %arg1) ( { + ^bb0(%arg2: tensor, %arg3: tensor): // no predecessors + %1 = mhlo.add %arg2, %arg3 : tensor + "mhlo.return"(%1) : (tensor) -> () + }) {dimensions = dense<1> : tensor<1xi64>} + : (tensor<1x8xf32>, tensor) -> tensor<1xf32> + return %0 : tensor<1xf32> +} diff --git 
a/tensorflow/compiler/mlir/hlo/tests/hlo-legalize-to-linalg.mlir b/tensorflow/compiler/mlir/hlo/tests/hlo-legalize-to-linalg.mlir index 320ce069ac0..46725e0bd09 100644 --- a/tensorflow/compiler/mlir/hlo/tests/hlo-legalize-to-linalg.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/hlo-legalize-to-linalg.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-hlo-opt %s -hlo-legalize-to-linalg -split-input-file | FileCheck %s +// RUN: mlir-hlo-opt %s -hlo-legalize-to-linalg -split-input-file | FILECHECK_OPTS="" FileCheck %s // CHECK: #map0 = affine_map<(d0, d1) -> (d0, d1)> // CHECK-LABEL: func @float_add @@ -557,3 +557,18 @@ func @reverse(%input: tensor<2x3xf32>) -> tensor<2x3xf32> { } // CHECK: linalg.generic // CHECK-SAME: indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]] + +// ----- + +// CHECK: #[[RESULT_MAP:.*]] = affine_map<(d0, d1) -> (d0, d1)> +// CHECK-LABEL: func @iota +func @iota() -> tensor<7x10xf32> { + %result = "mhlo.iota"() {iota_dimension = 1 : i64} : () -> (tensor<7x10xf32>) + return %result : tensor<7x10xf32> +} +// CHECK: linalg.indexed_generic +// CHECK-SAME: indexing_maps = [#[[RESULT_MAP]]] +// CHECK-NEXT: ^bb0(%[[D0:.*]]: index, %[[D1:.*]]: index): +// CHECK-NEXT: %[[INT_CAST:.*]] = index_cast %[[D1]] : index to i32 +// CHECK-NEXT: %[[FLOAT_CAST:.*]] = sitofp %[[INT_CAST]] : i32 to f32 +// CHECK-NEXT: linalg.yield %[[FLOAT_CAST]] : f32 diff --git a/tensorflow/compiler/mlir/hlo/tests/lhlo-copy-removal.mlir b/tensorflow/compiler/mlir/hlo/tests/lhlo-copy-removal.mlir index 6d7992cb868..3271595900d 100644 --- a/tensorflow/compiler/mlir/hlo/tests/lhlo-copy-removal.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/lhlo-copy-removal.mlir @@ -91,3 +91,25 @@ func @must_be_removed_second(%arg0: memref<2x2xf32>, dealloc %0 : memref<2x2xf32> "lmhlo.terminator"() : () -> () } + +// ----- + +// CHECK-LABEL: func @reduce +func @reduce(%arg0: memref<1x8xf32>, %arg1: memref, %arg2: memref<1xf32>) { + %0 = alloc() : memref<1xf32> + "lmhlo.reduce"(%arg0, %arg1, %0) ( { + // CHECK: ^bb0(%[[ARG0:.*]]: memref, %[[ARG1:.*]]: memref, + // CHECK-SAME: %[[ARG2:.*]]: memref) + ^bb0(%arg3: memref, %arg4: memref, %arg5: memref): + %1 = alloc() : memref + // CHECK: "lmhlo.add"(%[[ARG0]], %[[ARG1]], %[[ARG2]]) + "lmhlo.add"(%arg3, %arg4, %1) + : (memref, memref, memref) -> () + // CHECK-NOT; lmhlo.copy + "lmhlo.copy"(%1, %arg5) : (memref, memref) -> () + "lmhlo.terminator"() : () -> () + }) {dimensions = dense<1> : tensor<1xi64>} + : (memref<1x8xf32>, memref, memref<1xf32>) -> () + "lmhlo.copy"(%0, %arg2) : (memref<1xf32>, memref<1xf32>) -> () + return +} diff --git a/tensorflow/compiler/mlir/hlo/tests/lhlo-legalize-to-linalg.mlir b/tensorflow/compiler/mlir/hlo/tests/lhlo-legalize-to-linalg.mlir index dd88e5c80bf..768d8da22bd 100644 --- a/tensorflow/compiler/mlir/hlo/tests/lhlo-legalize-to-linalg.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/lhlo-legalize-to-linalg.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-hlo-opt %s -lhlo-legalize-to-linalg -split-input-file | FileCheck %s +// RUN: mlir-hlo-opt %s -lhlo-legalize-to-linalg -split-input-file | FILECHECK_OPTS="" FileCheck %s // CHECK: #map0 = affine_map<(d0, d1) -> (d0, d1)> // CHECK-LABEL: func @element_wise diff --git a/tensorflow/compiler/mlir/hlo/tests/lhlo-legalize-to-llvm.mlir b/tensorflow/compiler/mlir/hlo/tests/lhlo-legalize-to-llvm.mlir index a25a508b2d3..45c383bd1d6 100644 --- a/tensorflow/compiler/mlir/hlo/tests/lhlo-legalize-to-llvm.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/lhlo-legalize-to-llvm.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-hlo-opt %s 
--test-lhlo-legalize-to-llvm -split-input-file | FileCheck %s +// RUN: mlir-hlo-opt %s -lower-affine -convert-scf-to-std -test-lhlo-legalize-to-llvm -split-input-file | FileCheck %s // CHECK-LABEL: func @static_memref_cast func @static_memref_cast(%buf : memref<10x1x5xf32>) { @@ -11,11 +11,11 @@ func @static_memref_cast(%buf : memref<10x1x5xf32>) { // CHECK: %[[MEMREF_BLDR_0:.*]] = llvm.mlir.undef : [[DESCRIPTOR_TYPE_2D:!.*]] // CHECK: %[[IN_PTR:.*]] = llvm.extractvalue %[[INPUT_MEMREF:.*]][0] : [[DESCRIPTOR_TYPE_3D]] -// CHECK: %[[PTR:.*]] = llvm.bitcast %[[IN_PTR]] : !llvm<"float*"> to !llvm<"float*"> +// CHECK: %[[PTR:.*]] = llvm.bitcast %[[IN_PTR]] : !llvm.ptr to !llvm.ptr // CHECK: %[[MEMREF_BLDR_1:.*]] = llvm.insertvalue %[[PTR]], %[[MEMREF_BLDR_0]][0] : [[DESCRIPTOR_TYPE_2D]] // CHECK: %[[IN_ALIGNED_PTR:.*]] = llvm.extractvalue %[[INPUT_MEMREF]][1] : [[DESCRIPTOR_TYPE_3D]] -// CHECK: %[[ALIGNED_PTR:.*]] = llvm.bitcast %[[IN_ALIGNED_PTR]] : !llvm<"float*"> to !llvm<"float*"> +// CHECK: %[[ALIGNED_PTR:.*]] = llvm.bitcast %[[IN_ALIGNED_PTR]] : !llvm.ptr to !llvm.ptr // CHECK: %[[MEMREF_BLDR_2:.*]] = llvm.insertvalue %[[ALIGNED_PTR]], %[[MEMREF_BLDR_1]][1] : [[DESCRIPTOR_TYPE_2D]] // CHECK: %[[C2:.*]] = llvm.mlir.constant(2 : index) : !llvm.i64 @@ -50,11 +50,11 @@ func @dynamic_memref_cast(%buf : memref) { // CHECK: %[[MEMREF_BLDR_0:.*]] = llvm.mlir.undef : [[DESCRIPTOR_TYPE:!.*]] // CHECK: %[[IN_PTR:.*]] = llvm.extractvalue %[[INPUT_MEMREF:.*]][0] : [[DESCRIPTOR_TYPE]] -// CHECK: %[[PTR:.*]] = llvm.bitcast %[[IN_PTR]] : !llvm<"float*"> to !llvm<"float*"> +// CHECK: %[[PTR:.*]] = llvm.bitcast %[[IN_PTR]] : !llvm.ptr to !llvm.ptr // CHECK: %[[MEMREF_BLDR_1:.*]] = llvm.insertvalue %[[PTR]], %[[MEMREF_BLDR_0]][0] : [[DESCRIPTOR_TYPE]] // CHECK: %[[IN_ALIGNED_PTR:.*]] = llvm.extractvalue %[[INPUT_MEMREF]][1] : [[DESCRIPTOR_TYPE]] -// CHECK: %[[ALIGNED_PTR:.*]] = llvm.bitcast %[[IN_ALIGNED_PTR]] : !llvm<"float*"> to !llvm<"float*"> +// CHECK: %[[ALIGNED_PTR:.*]] = llvm.bitcast %[[IN_ALIGNED_PTR]] : !llvm.ptr to !llvm.ptr // CHECK: %[[MEMREF_BLDR_2:.*]] = llvm.insertvalue %[[ALIGNED_PTR]], %[[MEMREF_BLDR_1]][1] : [[DESCRIPTOR_TYPE]] // CHECK: %[[SRC_OFFSET:.*]] = llvm.extractvalue %[[INPUT_MEMREF]][2] : [[DESCRIPTOR_TYPE]] diff --git a/tensorflow/compiler/mlir/hlo/tests/lhlo-legalize-to-parallel-loops.mlir b/tensorflow/compiler/mlir/hlo/tests/lhlo-legalize-to-parallel-loops.mlir index 1530f59317d..47ef99bcac0 100644 --- a/tensorflow/compiler/mlir/hlo/tests/lhlo-legalize-to-parallel-loops.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/lhlo-legalize-to-parallel-loops.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-hlo-opt %s -lhlo-legalize-to-parallel-loops -canonicalize -split-input-file | FileCheck %s +// RUN: mlir-hlo-opt %s -lhlo-legalize-to-parallel-loops -canonicalize -split-input-file | FILECHECK_OPTS="" FileCheck %s func @reduce(%arg: memref<100x10x5xf32>, %init: memref, diff --git a/tensorflow/compiler/mlir/hlo/tests/lit.cfg.py b/tensorflow/compiler/mlir/hlo/tests/lit.cfg.py new file mode 100644 index 00000000000..f81d47a76cd --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/tests/lit.cfg.py @@ -0,0 +1,82 @@ +"""Lit configuration to drive test in this repo.""" +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# -*- Python -*- +# pylint: disable=undefined-variable + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +import lit.formats +from lit.llvm import llvm_config +from lit.llvm.subst import ToolSubst +import lit.util + +# Configuration file for the 'lit' test runner. + +# name: The name of this test suite. +config.name = 'MLIR_HLO_OPT' + +config.test_format = lit.formats.ShTest(not llvm_config.use_lit_shell) + +# suffixes: A list of file extensions to treat as test files. +config.suffixes = ['.mlir', '.mlir.py'] + +# test_source_root: The root path where tests are located. +config.test_source_root = os.path.dirname(__file__) + +# test_exec_root: The root path where tests should be run. +config.test_exec_root = os.path.join(config.mlir_hlo_obj_root, 'test') + +config.substitutions.append(('%PATH%', config.environment['PATH'])) +config.substitutions.append(('%shlibext', config.llvm_shlib_ext)) + +llvm_config.with_system_environment(['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP']) + +llvm_config.use_default_substitutions() + +# excludes: A list of directories to exclude from the testsuite. The 'Inputs' +# subdirectories contain auxiliary inputs for various tests in their parent +# directories. +config.excludes = [ + 'Inputs', 'Examples', 'CMakeLists.txt', 'README.txt', 'LICENSE.txt' +] + +# test_source_root: The root path where tests are located. +config.test_source_root = os.path.dirname(__file__) + +# test_exec_root: The root path where tests should be run. +config.test_exec_root = os.path.join(config.mlir_hlo_obj_root, 'test') +config.mlir_hlo_tools_dir = os.path.join(config.mlir_hlo_obj_root, 'tools') + +# Tweak the PATH to include the tools dir. +llvm_config.with_environment('PATH', config.llvm_tools_dir, append_path=True) + +tool_dirs = [ + os.path.join(config.mlir_hlo_tools_dir, 'mlir-hlo-opt'), + config.llvm_tools_dir, +] +tools = [ + 'mlir-hlo-opt', + 'mlir-cpu-runner', + ToolSubst( + '%mlir_runner_utils_dir', + config.mlir_runner_utils_dir, + unresolved='ignore'), +] + +llvm_config.add_tool_substitutions(tools, tool_dirs) diff --git a/tensorflow/compiler/mlir/hlo/tests/lit.site.cfg.py.in b/tensorflow/compiler/mlir/hlo/tests/lit.site.cfg.py.in new file mode 100644 index 00000000000..1555d314df0 --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/tests/lit.site.cfg.py.in @@ -0,0 +1,63 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
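As a usage sketch for the new CMake/lit wiring: once configured, tests are plain .mlir files under tests/, picked up by the suffixes set in lit.cfg.py above and driven through the mlir-hlo-opt tool substitution, with check-mlir-hlo-lit as the lit target per the CMakeLists.txt earlier in this diff. A trivial, hypothetical test file (the function name and checked op are only illustrative) would look like:

// RUN: mlir-hlo-opt %s | FileCheck %s

// CHECK-LABEL: func @roundtrip
func @roundtrip(%arg0: tensor<4xf32>) -> tensor<4xf32> {
  // CHECK: mhlo.add
  %0 = mhlo.add %arg0, %arg0 : tensor<4xf32>
  return %0 : tensor<4xf32>
}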
+ +@LIT_SITE_CFG_IN_HEADER@ + +import sys + +config.host_triple = "@LLVM_HOST_TRIPLE@" +config.target_triple = "@TARGET_TRIPLE@" +config.llvm_src_root = "@LLVM_SOURCE_DIR@" +config.llvm_obj_root = "@LLVM_BINARY_DIR@" +config.llvm_tools_dir = "@LLVM_TOOLS_DIR@" +config.llvm_lib_dir = "@LLVM_LIBRARY_DIR@" +config.llvm_shlib_dir = "@SHLIBDIR@" +config.llvm_shlib_ext = "@SHLIBEXT@" +config.llvm_exe_ext = "@EXEEXT@" +config.lit_tools_dir = "@LLVM_LIT_TOOLS_DIR@" +config.python_executable = "@PYTHON_EXECUTABLE@" +config.gold_executable = "@GOLD_EXECUTABLE@" +config.ld64_executable = "@LD64_EXECUTABLE@" +config.enable_shared = @ENABLE_SHARED@ +config.enable_assertions = @ENABLE_ASSERTIONS@ +config.targets_to_build = "@TARGETS_TO_BUILD@" +config.native_target = "@LLVM_NATIVE_ARCH@" +config.llvm_bindings = "@LLVM_BINDINGS@".split(' ') +config.host_os = "@HOST_OS@" +config.host_cc = "@HOST_CC@" +config.host_cxx = "@HOST_CXX@" +# Note: ldflags can contain double-quoted paths, so must use single quotes here. +config.host_ldflags = '@HOST_LDFLAGS@' +config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@" +config.llvm_host_triple = '@LLVM_HOST_TRIPLE@' +config.host_arch = "@HOST_ARCH@" +config.mlir_hlo_src_root = "@CMAKE_SOURCE_DIR@" +config.mlir_hlo_obj_root = "@CMAKE_BINARY_DIR@" +config.mlir_runner_utils_dir = os.path.join(config.llvm_obj_root, "lib") + +# Support substitution of the tools_dir with user parameters. This is +# used when we can't determine the tool dir at configuration time. +try: + config.llvm_tools_dir = config.llvm_tools_dir % lit_config.params + config.llvm_shlib_dir = config.llvm_shlib_dir % lit_config.params +except KeyError: + e = sys.exc_info()[1] + key, = e.args + lit_config.fatal("unable to find %r parameter, use '--param=%s=VALUE'" % (key,key)) + + +import lit.llvm +lit.llvm.initialize(lit_config, config) + +# Let the main config do the real work. +lit_config.load_config(config, "@CMAKE_SOURCE_DIR@/tests/lit.cfg.py") diff --git a/tensorflow/compiler/mlir/hlo/tests/lower-complex.mlir b/tensorflow/compiler/mlir/hlo/tests/lower-complex.mlir index 8d84e7140f3..a7bd21257a6 100644 --- a/tensorflow/compiler/mlir/hlo/tests/lower-complex.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/lower-complex.mlir @@ -182,11 +182,10 @@ func @abs(%arg0 : tensor<2xf32>, %arg1 : tensor<2xf32>) -> (tensor<2xf32>) { // CHECK-DAG: [[VAL1:%.+]] = mhlo.multiply %arg1, %arg1 // CHECK-DAG: [[VAL2:%.+]] = mhlo.add [[VAL0]], [[VAL1]] // CHECK-DAG: [[VAL3:%.+]] = "mhlo.sqrt"([[VAL2]]) - %1 = "mhlo.abs"(%0) : (tensor<2xcomplex>) -> (tensor<2xcomplex>) - %2 = "mhlo.real"(%1) : (tensor<2xcomplex>) -> (tensor<2xf32>) + %1 = "mhlo.abs"(%0) : (tensor<2xcomplex>) -> (tensor<2xf32>) // CHECK: return [[VAL3]] - return %2 : tensor<2xf32> + return %1 : tensor<2xf32> } // CHECK-LABEL: @exp diff --git a/tensorflow/compiler/mlir/hlo/tests/mhlo-transform-unranked.mlir b/tensorflow/compiler/mlir/hlo/tests/mhlo-transform-unranked.mlir index 80474156f29..56a7cf7294c 100644 --- a/tensorflow/compiler/mlir/hlo/tests/mhlo-transform-unranked.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/mhlo-transform-unranked.mlir @@ -5,10 +5,9 @@ func @sqr_transform_result(%a: tensor<*xf32>) -> tensor<*xf32> { // Flatten operand shape. 
- %shape = shape.shape_of %a : tensor<*xf32> - %num_elements = shape.num_elements %shape - %num_elements_as_index = shape.size_to_index %num_elements - %flat_shape = tensor_from_elements(%num_elements_as_index) : tensor<1xindex> + %shape = shape.shape_of %a : tensor<*xf32> -> tensor + %num_elements = shape.num_elements %shape : tensor -> index + %flat_shape = tensor_from_elements(%num_elements) : tensor<1xindex> %flat_a = "mhlo.dynamic_reshape"(%a, %flat_shape) : (tensor<*xf32>, tensor<1xindex>) -> tensor @@ -16,8 +15,7 @@ func @sqr_transform_result(%a: tensor<*xf32>) -> tensor<*xf32> { %flat_b = "mhlo.sqrt"(%flat_a) : (tensor) -> tensor // Restore original shape. - %shape_as_extent_tensor = shape.to_extent_tensor %shape : tensor - %b = "mhlo.dynamic_reshape"(%flat_b, %shape_as_extent_tensor) + %b = "mhlo.dynamic_reshape"(%flat_b, %shape) : (tensor, tensor) -> tensor<*xf32> return %b : tensor<*xf32> @@ -29,14 +27,12 @@ func @sqr_transform_result(%a: tensor<*xf32>) -> tensor<*xf32> { // CHECK-LABEL: @sqrt // CHECK-SAME: (%[[A:.*]]: tensor<*xf32>) func @sqrt(%a: tensor<*xf32>) -> tensor<*xf32> { - // CHECK-NEXT: %[[SHAPE:.*]] = shape.shape_of %[[A]] : tensor<*xf32> + // CHECK-NEXT: %[[SHAPE:.*]] = shape.shape_of %[[A]] : tensor<*xf32> -> tensor // CHECK-NEXT: %[[NUM_ELEMENTS:.*]] = shape.num_elements %[[SHAPE]] - // CHECK-NEXT: %[[NUM_ELEMENTS_AS_INDEX:.*]] = shape.size_to_index %[[NUM_ELEMENTS]] - // CHECK-NEXT: %[[FLAT_SHAPE:.*]] = tensor_from_elements(%[[NUM_ELEMENTS_AS_INDEX]]) : tensor<1xindex> + // CHECK-NEXT: %[[FLAT_SHAPE:.*]] = tensor_from_elements(%[[NUM_ELEMENTS]]) : tensor<1xindex> // CHECK-NEXT: %[[FLAT_A:.*]] = "mhlo.dynamic_reshape"(%[[A]], %[[FLAT_SHAPE]]) : (tensor<*xf32>, tensor<1xindex>) -> tensor // CHECK-NEXT: %[[FLAT_B:.*]] = "mhlo.sqrt"(%[[FLAT_A]]) : (tensor) -> tensor - // CHECK-NEXT: %[[SHAPE_AS_EXTENT_TENSOR:.*]] = shape.to_extent_tensor %[[SHAPE]] : tensor - // CHECK-NEXT: %[[B:.*]] = "mhlo.dynamic_reshape"(%[[FLAT_B]], %[[SHAPE_AS_EXTENT_TENSOR]]) : (tensor, tensor) -> tensor<*xf32> + // CHECK-NEXT: %[[B:.*]] = "mhlo.dynamic_reshape"(%[[FLAT_B]], %[[SHAPE]]) : (tensor, tensor) -> tensor<*xf32> // CHECK-NEXT: return %[[B]] : tensor<*xf32> %b = "mhlo.sqrt"(%a) : (tensor<*xf32>) -> tensor<*xf32> return %b : tensor<*xf32> @@ -73,15 +69,13 @@ func @sqrt_static(%a: tensor<2x3xf32>) -> tensor<2x3xf32> { func @add_unranked(%a : tensor<*xf32>, %b : tensor<*xf32>) -> tensor<*xf32> { // CHECK: %[[SHAPE_A:.*]] = shape.shape_of %[[A]] // CHECK: %[[SHAPE_B:.*]] = shape.shape_of %[[B]] - // CHECK: %[[SHAPE:.*]] = shape.any %[[SHAPE_A]], %[[SHAPE_B]] + // CHECK: %[[SHAPE:.*]] = "shape.any"(%[[SHAPE_A]], %[[SHAPE_B]]) // CHECK: %[[NUM_ELEMENTS:.*]] = shape.num_elements %[[SHAPE]] - // CHECK: %[[NUM_ELEMENTS_AS_INDEX:.*]] = shape.size_to_index %[[NUM_ELEMENTS]] - // CHECK: %[[FLAT_SHAPE:.*]] = tensor_from_elements(%[[NUM_ELEMENTS_AS_INDEX]]) : tensor<1xindex> + // CHECK: %[[FLAT_SHAPE:.*]] = tensor_from_elements(%[[NUM_ELEMENTS]]) : tensor<1xindex> // CHECK: %[[FLAT_A:.*]] = "mhlo.dynamic_reshape"(%[[A]], %[[FLAT_SHAPE]]) : (tensor<*xf32>, tensor<1xindex>) -> tensor // CHECK: %[[FLAT_B:.*]] = "mhlo.dynamic_reshape"(%[[B]], %[[FLAT_SHAPE]]) : (tensor<*xf32>, tensor<1xindex>) -> tensor // CHECK: %[[FLAT_RESULT:.*]] = mhlo.add %[[FLAT_A]], %[[FLAT_B]] : tensor - // CHECK: %[[SHAPE_AS_EXTENT_TENSOR:.*]] = shape.to_extent_tensor %[[SHAPE]] : tensor - // CHECK: %[[RESULT:.*]] = "mhlo.dynamic_reshape"(%[[FLAT_RESULT]], %[[SHAPE_AS_EXTENT_TENSOR]]) : (tensor, tensor) -> tensor<*xf32> + // 
CHECK: %[[RESULT:.*]] = "mhlo.dynamic_reshape"(%[[FLAT_RESULT]], %[[SHAPE]]) : (tensor, tensor) -> tensor<*xf32> // CHECK: return %[[RESULT]] : tensor<*xf32> %result = mhlo.add %a, %b : tensor<*xf32> return %result : tensor<*xf32> diff --git a/tensorflow/compiler/mlir/hlo/tests/ops.mlir b/tensorflow/compiler/mlir/hlo/tests/ops.mlir index b46827b88a5..a8f16c403ae 100644 --- a/tensorflow/compiler/mlir/hlo/tests/ops.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/ops.mlir @@ -116,6 +116,30 @@ func @dynamic_broadcast_in_dim(%arg0: tensor, %shape: tensor<3xi64>) -> // ----- +// CHECK-LABEL: func @dynamic_broadcast_in_dim_unknown_dim +func @dynamic_broadcast_in_dim_unknown_dim(%arg0: tensor<32xf32>, %shape: tensor<3xi64>) -> tensor { + %0 = "mhlo.dynamic_broadcast_in_dim"(%arg0, %shape) {broadcast_dimensions = dense<[2]> : tensor<1xi64>} : (tensor<32xf32>, tensor<3xi64>) -> tensor + return %0 : tensor +} + +// ----- + +// CHECK-LABEL: func @dynamic_broadcast_in_dim_ok_dim +func @dynamic_broadcast_in_dim_ok_dim(%arg0: tensor<1xf32>, %shape: tensor<3xi64>) -> tensor<7x8x9xf32> { + %0 = "mhlo.dynamic_broadcast_in_dim"(%arg0, %shape) {broadcast_dimensions = dense<[2]> : tensor<1xi64>} : (tensor<1xf32>, tensor<3xi64>) -> tensor<7x8x9xf32> + return %0 : tensor<7x8x9xf32> +} + +// ----- + +func @dynamic_broadcast_in_dim_shape_mismatch(%arg0: tensor<32xf32>, %shape: tensor<3xi64>) -> tensor<7x8x9xf32> { + // expected-error@+1 {{size of operand dimension 0 (32) is not compatible with size of result dimension 2 (9)}} + %0 = "mhlo.dynamic_broadcast_in_dim"(%arg0, %shape) {broadcast_dimensions = dense<[2]> : tensor<1xi64>} : (tensor<32xf32>, tensor<3xi64>) -> tensor<7x8x9xf32> + return %0 : tensor<7x8x9xf32> +} + +// ----- + func @broadcast_in_dim_bad_dimension_rank(%arg0: tensor<1x2xi32>) -> tensor<1x2x3xi32> { // expected-error@+1 {{broadcast_dimensions has rank 2 instead of rank 1}} %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[[1,1],[1,1]]> : tensor<2x2xi64>} : (tensor<1x2xi32>) -> tensor<1x2x3xi32> @@ -456,7 +480,7 @@ func @map_non_scalar_computation_operand(%arg0: tensor<4x5xf32>, %arg1: tensor<4 // expected-error@+1 {{computation arguments must be 0-rank tensor, but got: arg #1 of type 'tensor<5xf32>'}} %0 = "mhlo.map"(%arg0, %arg1) ( { ^bb0(%arg2: tensor, %arg3: tensor<5xf32>): - %1 = mhlo.constant {value = dense<2.0> : tensor} : tensor + %1 = mhlo.constant dense<2.0> : tensor "mhlo.return"(%1) : (tensor) -> () }) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<4x5xf32>, tensor<4x5xf32>) -> tensor<4x5xf32> return %0 : tensor<4x5xf32> @@ -468,7 +492,7 @@ func @map_mismatch_operand_and_computation_args(%arg0: tensor<4x5xf32>, %arg1: t // expected-error@+1 {{element type of operands and computation arguments must match, but got: 'f32' and 'i32'}} %0 = "mhlo.map"(%arg0, %arg1) ( { ^bb0(%arg2: tensor, %arg3: tensor): - %1 = mhlo.constant {value = dense<2.0> : tensor} : tensor + %1 = mhlo.constant dense<2.0> : tensor "mhlo.return"(%1) : (tensor) -> () }) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<4x5xf32>, tensor<4x5xf32>) -> tensor<4x5xf32> return %0 : tensor<4x5xf32> @@ -480,7 +504,7 @@ func @map_invalid_number_of_computation_output(%arg0: tensor<4x5xf32>, %arg1: te // expected-error@+1 {{computation must return single output, but got: 0}} %0 = "mhlo.map"(%arg0, %arg1) ( { ^bb0(%arg2: tensor, %arg3: tensor): - %1 = mhlo.constant {value = dense<2.0> : tensor} : tensor + %1 = mhlo.constant dense<2.0> : tensor "mhlo.return"() : () -> () }) {dimensions = dense<[0, 1]> : 
tensor<2xi64>} : (tensor<4x5xf32>, tensor<4x5xf32>) -> tensor<4x5xf32> return %0 : tensor<4x5xf32> @@ -492,7 +516,7 @@ func @main_non_scalar_computation_output(%arg0: tensor<4x5xf32>, %arg1: tensor<4 // expected-error@+1 {{computation must return 0-rank tensor, but got: 'tensor<5xf32>'}} %0 = "mhlo.map"(%arg0, %arg1) ( { ^bb0(%arg2: tensor, %arg3: tensor): - %1 = mhlo.constant {value = dense<2.0> : tensor} : tensor<5xf32> + %1 = mhlo.constant dense<2.0> : tensor<5xf32> "mhlo.return"(%1) : (tensor<5xf32>) -> () }) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<4x5xf32>, tensor<4x5xf32>) -> tensor<4x5xf32> return %0 : tensor<4x5xf32> @@ -504,7 +528,7 @@ func @mismatch_computation_output_type(%arg0: tensor<4x5xf32>, %arg1: tensor<4x5 // expected-error@+1 {{element type of result and computation output must match, but got: 'f32' and 'i32'}} %0 = "mhlo.map"(%arg0, %arg1) ( { ^bb0(%arg2: tensor, %arg3: tensor): - %1 = mhlo.constant {value = dense<2> : tensor} : tensor + %1 = mhlo.constant dense<2> : tensor "mhlo.return"(%1) : (tensor) -> () }) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<4x5xf32>, tensor<4x5xf32>) -> tensor<4x5xf32> return %0 : tensor<4x5xf32> @@ -730,6 +754,14 @@ func @dynamic_update_slice_invalid_start(%input: tensor<3x4xi64>, %update: tenso // ----- +func @dynamic_update_slice_mismatched_start(%input: tensor<11x3x4xi32>, %update: tensor<1x3x4xi32>, %start1: tensor, %start2: tensor, %start3: tensor) -> tensor<11x3x4xi32> { + // expected-error@+1 {{start indices must have same element type (encountered mismatch: 'i32' vs 'i64')}} + %0 = "mhlo.dynamic-update-slice"(%input, %update, %start1, %start2, %start3) : (tensor<11x3x4xi32>, tensor<1x3x4xi32>, tensor, tensor, tensor) -> tensor<11x3x4xi32> + return %0 : tensor<11x3x4xi32> +} + +// ----- + // CHECK-LABEL: func @transpose func @transpose(%arg0: tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> { %0 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0, 3, 2]> : tensor<4xi64>} : (tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> @@ -847,6 +879,13 @@ func @tuple(%arg0: tensor<1xi32>, %arg1: tensor<1x2xf32>) -> tuple // ----- +func @tuple_token(%arg0: tensor, %arg1: !mhlo.token) -> tuple, !mhlo.token> { + %0 = "mhlo.tuple"(%arg0, %arg1) : (tensor, !mhlo.token) -> tuple, !mhlo.token> + return %0 : tuple, !mhlo.token> +} + +// ----- + func @tuple_arg_size_mismatch(%arg0: tensor, %arg1: tensor) -> tuple, tensor, tensor> { // expected-error@+1 {{has return type tuple, tensor, tensor>, but expected tuple, tensor>}} %0 = "mhlo.tuple"(%arg0, %arg1) : (tensor, tensor) -> tuple, tensor, tensor> @@ -939,7 +978,23 @@ func @constants() -> () { func @constant_invalid() -> () { // expected-error@+1 {{op failed to verify that all of {value, output} have same type}} - %0 = "mhlo.constant"() {value = dense<0> : tensor} : () -> (tensor<*xi32>) + %0 = "mhlo.constant"() {value = dense<0> : tensor} : () -> (tensor<3xi32>) + return +} + +// ----- + +func @constant_invalid() -> () { + // expected-error@+1 {{op result #0 must be statically shaped tensor}} + %0 = "mhlo.constant"() {value = dense<1> : tensor} : () -> tensor + return +} + +// ----- + +func @constant_invalid() -> () { + // expected-error@+1 {{elements literal type must have static shape}} + %0 = "mhlo.constant"() {value = dense<1> : tensor} : () -> tensor return } diff --git a/tensorflow/compiler/mlir/hlo/tests/optimize-hlo.mlir b/tensorflow/compiler/mlir/hlo/tests/optimize-hlo.mlir new file mode 100644 index 00000000000..c20de0b2a9f --- /dev/null +++ 
b/tensorflow/compiler/mlir/hlo/tests/optimize-hlo.mlir
@@ -0,0 +1,64 @@
+// RUN: mlir-hlo-opt %s -pass-pipeline='func(mhlo-test-optimize)' | FileCheck %s
+
+// CHECK-LABEL: @gather_is_slice_no_rank
+func @gather_is_slice_no_rank(%arg0: tensor<2x1x2xi32>, %arg1: tensor) -> tensor<1x2xi32> {
+  // CHECK: [[CST:%.+]] = mhlo.constant dense<0> : tensor
+  // CHECK: [[SLICE:%.+]] = "mhlo.dynamic-slice"(%arg0, %arg1, [[CST]], [[CST]]) {slice_sizes = dense<[1, 1, 2]> : tensor<3xi64>}
+  // CHECK: [[RESHAPE:%.+]] = "mhlo.reshape"([[SLICE]])
+  %res = "mhlo.gather"(%arg0, %arg1) {
+    dimension_numbers = {
+      collapsed_slice_dims = dense<0> : tensor<1xi64>,
+      index_vector_dim = 0 : i64,
+      offset_dims = dense<[0, 1]> : tensor<2xi64>,
+      start_index_map = dense<0> : tensor<1xi64>
+    },
+    slice_sizes = dense<[1, 1, 2]> : tensor<3xi64>
+  } : (tensor<2x1x2xi32>, tensor) -> tensor<1x2xi32>
+
+  // CHECK: return [[RESHAPE]]
+  return %res : tensor<1x2xi32>
+}
+
+// CHECK-LABEL: @gather_is_slice
+func @gather_is_slice(%arg0: tensor<2x1x2xi32>, %arg1: tensor<1xi64>) -> tensor<1x2xi32> {
+  // CHECK: [[CST:%.+]] = mhlo.constant dense<0> : tensor
+  // CHECK: [[RESHAPE:%.+]] = "mhlo.reshape"(%arg1)
+  // CHECK: [[SLICE:%.+]] = "mhlo.dynamic-slice"(%arg0, [[RESHAPE]], [[CST]], [[CST]]) {slice_sizes = dense<[1, 1, 2]> : tensor<3xi64>}
+  // CHECK: [[RES:%.+]] = "mhlo.reshape"([[SLICE]])
+
+  %res = "mhlo.gather"(%arg0, %arg1) {
+    dimension_numbers = {
+      collapsed_slice_dims = dense<0> : tensor<1xi64>,
+      index_vector_dim = 0 : i64,
+      offset_dims = dense<[0, 1]> : tensor<2xi64>,
+      start_index_map = dense<0> : tensor<1xi64>
+    },
+    slice_sizes = dense<[1, 1, 2]> : tensor<3xi64>
+  } : (tensor<2x1x2xi32>, tensor<1xi64>) -> tensor<1x2xi32>
+
+  // CHECK: return [[RES]]
+  return %res : tensor<1x2xi32>
+}
+
+// CHECK-LABEL: @gather_is_slice_multiple_start_indices
+func @gather_is_slice_multiple_start_indices(%arg0: tensor<2x1x2xi32>, %arg1: tensor<2xi64>) -> tensor<1x2xi32> {
+  // CHECK-DAG: [[CST:%.+]] = mhlo.constant dense<0>
+  // CHECK-DAG: [[SLICE1:%.+]] = "mhlo.slice"(%arg1) {limit_indices = dense<1> : tensor<1xi64>, start_indices = dense<0> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
+  // CHECK-DAG: [[RESHAPE1:%.+]] = "mhlo.reshape"([[SLICE1]])
+  // CHECK-DAG: [[SLICE2:%.+]] = "mhlo.slice"(%arg1) {limit_indices = dense<2> : tensor<1xi64>, start_indices = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
+  // CHECK-DAG: [[RESHAPE2:%.+]] = "mhlo.reshape"([[SLICE2]])
+  // CHECK-DAG: [[DSLICE:%.+]] = "mhlo.dynamic-slice"(%arg0, [[RESHAPE1]], [[RESHAPE2]], [[CST]]) {slice_sizes = dense<[1, 1, 2]> : tensor<3xi64>}
+  // CHECK-DAG: [[RES:%.+]] = "mhlo.reshape"([[DSLICE]])
+  %res = "mhlo.gather"(%arg0, %arg1) {
+    dimension_numbers = {
+      collapsed_slice_dims = dense<0> : tensor<1xi64>,
+      index_vector_dim = 0 : i64,
+      offset_dims = dense<[0, 1]> : tensor<2xi64>,
+      start_index_map = dense<0> : tensor<1xi64>
+    },
+    slice_sizes = dense<[1, 1, 2]> : tensor<3xi64>
+  } : (tensor<2x1x2xi32>, tensor<2xi64>) -> tensor<1x2xi32>
+
+  // CHECK: return [[RES]]
+  return %res : tensor<1x2xi32>
+}
diff --git a/tensorflow/compiler/mlir/hlo/tests/sink-constants-to-control-flow.mlir b/tensorflow/compiler/mlir/hlo/tests/sink-constants-to-control-flow.mlir
index f8b6b629c9e..9e18ad8a2d8 100644
--- a/tensorflow/compiler/mlir/hlo/tests/sink-constants-to-control-flow.mlir
+++ b/tensorflow/compiler/mlir/hlo/tests/sink-constants-to-control-flow.mlir
@@ -58,3 +58,17 @@ func @sink_const_to_conditional(%arg0: tensor) -> tensor {
   %9 = "mhlo.get_tuple_element"(%2) {index = 0 : i32} : (tuple>) -> tensor
   return %9 : tensor
 }
+
+func @sink_const_to_sort(%arg0: tensor<16xf32>) {
+  %c0 = constant dense<1.0> : tensor
+  // CHECK: "mhlo.sort"
+  %0 = "mhlo.sort"(%arg0) ( {
+  ^bb0(%arg1: tensor, %arg2: tensor):
+    // CHECK: constant dense<1.000000e+00>
+    %1 = "mhlo.divide"(%arg1, %c0) : (tensor, tensor) -> tensor
+    %2 = "mhlo.divide"(%arg2, %c0) : (tensor, tensor) -> tensor
+    %3 = "mhlo.compare"(%1, %2) {comparison_direction = "GT"} : (tensor, tensor) -> tensor
+    "mhlo.return"(%3) : (tensor) -> ()
+  }) {is_stable = true} : (tensor<16xf32>) -> tensor<16xi32>
+  return
+}
diff --git a/tensorflow/compiler/mlir/hlo/tests/unfuse_batch_norm.mlir b/tensorflow/compiler/mlir/hlo/tests/unfuse_batch_norm.mlir
index c1930721218..f903dbb7080 100644
--- a/tensorflow/compiler/mlir/hlo/tests/unfuse_batch_norm.mlir
+++ b/tensorflow/compiler/mlir/hlo/tests/unfuse_batch_norm.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-hlo-opt -split-input-file -mhlo-test-unfuse-batch-norm -verify-diagnostics %s | FileCheck --enable-var-scope %s
+// RUN: mlir-hlo-opt -split-input-file -mhlo-test-unfuse-batch-norm -verify-diagnostics %s | FILECHECK_OPTS="" FileCheck --enable-var-scope %s
 // CHECK-LABEL: @batchNormInference_2D_inner_features
 // CHECK-SAME: %[[X:[^:[:space:]]+]]
diff --git a/tensorflow/compiler/mlir/hlo/tools/CMakeLists.txt b/tensorflow/compiler/mlir/hlo/tools/CMakeLists.txt
new file mode 100644
index 00000000000..0f3d1c85795
--- /dev/null
+++ b/tensorflow/compiler/mlir/hlo/tools/CMakeLists.txt
@@ -0,0 +1,16 @@
+#
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+add_subdirectory(mlir-hlo-opt)
diff --git a/tensorflow/compiler/mlir/hlo/tools/mlir-hlo-opt/CMakeLists.txt b/tensorflow/compiler/mlir/hlo/tools/mlir-hlo-opt/CMakeLists.txt
new file mode 100644
index 00000000000..754469a3c84
--- /dev/null
+++ b/tensorflow/compiler/mlir/hlo/tools/mlir-hlo-opt/CMakeLists.txt
@@ -0,0 +1,32 @@
+#
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
+get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS)
+set(LIBS
+  ${dialect_libs}
+  ${conversion_libs}
+  MLIROptLib
+
+  MhloRegisterDialects
+  AllMhloPasses
+  )
+add_llvm_executable(mlir-hlo-opt mlir-hlo-opt.cpp
+  DEPENDS
+  MLIRLmhloPassIncGen
+  MLIRMhloPassIncGen
+)
+llvm_update_compile_flags(mlir-hlo-opt)
+target_link_libraries(mlir-hlo-opt PRIVATE ${LIBS})
diff --git a/tensorflow/compiler/mlir/hlo/tools/mlir-hlo-opt/mlir-hlo-opt.cpp b/tensorflow/compiler/mlir/hlo/tools/mlir-hlo-opt/mlir-hlo-opt.cpp
new file mode 100644
index 00000000000..70fc21d6959
--- /dev/null
+++ b/tensorflow/compiler/mlir/hlo/tools/mlir-hlo-opt/mlir-hlo-opt.cpp
@@ -0,0 +1,121 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/ToolOutputFile.h"
+#include "mlir-hlo/Dialect/mhlo/IR/register.h"
+#include "mlir-hlo/Dialect/mhlo/transforms/register_passes.h"
+#include "mlir/IR/Dialect.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/InitAllDialects.h"
+#include "mlir/InitAllPasses.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Support/FileUtilities.h"
+#include "mlir/Support/MlirOptMain.h"
+
+// NOLINTNEXTLINE
+static llvm::cl::opt<std::string> inputFilename(llvm::cl::Positional,
+                                                llvm::cl::desc(""),
+                                                llvm::cl::init("-"));
+
+// NOLINTNEXTLINE
+static llvm::cl::opt<std::string> outputFilename(
+    "o", llvm::cl::desc("Output filename"), llvm::cl::value_desc("filename"),
+    llvm::cl::init("-"));
+
+// NOLINTNEXTLINE
+static llvm::cl::opt<bool> splitInputFile(
+    "split-input-file",
+    llvm::cl::desc("Split the input file into pieces and process each "
+                   "chunk independently"),
+    llvm::cl::init(false));
+
+// NOLINTNEXTLINE
+static llvm::cl::opt<bool> verifyDiagnostics(
+    "verify-diagnostics",
+    llvm::cl::desc("Check that emitted diagnostics match "
+                   "expected-* lines on the corresponding line"),
+    llvm::cl::init(false));
+
+// NOLINTNEXTLINE
+static llvm::cl::opt<bool> verifyPasses(
+    "verify-each",
+    llvm::cl::desc("Run the verifier after each transformation pass"),
+    llvm::cl::init(true));
+
+// NOLINTNEXTLINE
+static llvm::cl::opt<bool> allowUnregisteredDialects(
+    "allow-unregistered-dialect",
+    llvm::cl::desc("Allow operation with no registered dialects"),
+    llvm::cl::init(false));
+
+// NOLINTNEXTLINE
+static llvm::cl::opt<bool> showDialects(
+    "show-dialects", llvm::cl::desc("Print the list of registered dialects"),
+    llvm::cl::init(false));
+
+int main(int argc, char **argv) {
+  mlir::registerAllDialects();
+  mlir::registerAllPasses();
+
+  mlir::mhlo::registerAllDialects();
+  mlir::mhlo::registerAllMhloPasses();
+  mlir::lmhlo::registerAllLmhloPasses();
+
+  llvm::InitLLVM y(argc, argv);
+
+  // Register any pass manager command line options.
+  mlir::registerPassManagerCLOptions();
+  mlir::PassPipelineCLParser passPipeline("", "Compiler passes to run");
+
+  // Parse pass names in main to ensure static initialization completed.
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "MLIR modular optimizer driver\n");
+
+  if (showDialects) {
+    mlir::MLIRContext context;
+    llvm::outs() << "Registered Dialects:\n";
+    for (mlir::Dialect *dialect : context.getRegisteredDialects()) {
+      llvm::outs() << dialect->getNamespace() << "\n";
+    }
+    return 0;
+  }
+
+  // Set up the input file.
+  std::string errorMessage;
+  auto file = mlir::openInputFile(inputFilename, &errorMessage);
+  if (!file) {
+    llvm::errs() << errorMessage << "\n";
+    return 1;
+  }
+
+  auto output = mlir::openOutputFile(outputFilename, &errorMessage);
+  if (!output) {
+    llvm::errs() << errorMessage << "\n";
+    exit(1);
+  }
+
+  if (failed(MlirOptMain(output->os(), std::move(file), passPipeline,
+                         splitInputFile, verifyDiagnostics, verifyPasses,
+                         allowUnregisteredDialects))) {
+    return 1;
+  }
+  // Keep the output file if the invocation of MlirOptMain was successful.
+  output->keep();
+  return 0;
+}
diff --git a/tensorflow/compiler/mlir/lite/BUILD b/tensorflow/compiler/mlir/lite/BUILD
index 8d0c204f434..555c11779f5 100644
--- a/tensorflow/compiler/mlir/lite/BUILD
+++ b/tensorflow/compiler/mlir/lite/BUILD
@@ -25,7 +25,6 @@ package_group(
 filegroup(
     name = "tensorflow_lite_ops_td_files",
     srcs = [
-        "experimental/tfl_hardware_interfaces.td",
         "ir/tfl_op_interfaces.td",
         "ir/tfl_ops.td",
         "//tensorflow/compiler/mlir/lite/quantization:quantization_td_files",
@@ -221,18 +220,14 @@ cc_library(
     ],
     deps = [
         ":tensorflow_lite_ops_inc_gen",
-        ":validators",
-        "//tensorflow/compiler/mlir/lite/experimental/estimators:cost_estimators",
         "//tensorflow/compiler/mlir/lite/quantization:quantization_lib",
         "//tensorflow/compiler/mlir/tensorflow:tensorflow_types",
         "//tensorflow/lite/schema:schema_fbs",
         "@llvm-project//llvm:Support",
-        "@llvm-project//mlir:Analysis",
         "@llvm-project//mlir:DerivedAttributeOpInterface",
         "@llvm-project//mlir:Dialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:LoopLikeInterface",
-        "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:QuantOps",
         "@llvm-project//mlir:SideEffects",
         "@llvm-project//mlir:StandardOps",
@@ -273,7 +268,9 @@ cc_library(
     deps = [
         ":tensorflow_lite",
         "//tensorflow/compiler/mlir/tensorflow",
+        "//tensorflow/compiler/mlir/tensorflow:tensorflow_attributes",
         "//tensorflow/core:framework",
+        "@flatbuffers",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:StandardOps",
@@ -338,6 +335,7 @@ cc_library(
         "transforms/optimize_functional_ops.cc",
         "transforms/prepare_composite_functions_tf.cc",
         "transforms/prepare_tf.cc",
+        "transforms/raise_custom_ops.cc",
         "transforms/runtime_verify.cc",
         "transforms/split_merged_operands.cc",
         "transforms/trim_functions_tf.cc",
@@ -349,26 +347,29 @@ cc_library(
         "transforms/passes.h",
     ],
     deps = [
-        ":common",
         ":lstm_utils",
         ":stateful_ops_utils",
         ":tensorflow_lite",
         ":tftext_utils",
         ":validators",
         "//tensorflow/compiler/mlir:op_or_arg_name_mapper",
+        "//tensorflow/compiler/mlir/hlo",
         "//tensorflow/compiler/mlir/lite/quantization:quantization_lib",
         "//tensorflow/compiler/mlir/tensorflow",
         "//tensorflow/compiler/mlir/tensorflow:convert_tensor",
         "//tensorflow/compiler/mlir/tensorflow:mangling_util",
+        "//tensorflow/compiler/mlir/tensorflow:tensorflow_attributes",
         "//tensorflow/compiler/mlir/tensorflow:tensorflow_types",
+        "//tensorflow/compiler/mlir/tensorflow:tf_legalize_hlo",
"//tensorflow/compiler/mlir/tensorflow:unroll_batch_matmul_pass", + "//tensorflow/compiler/mlir/xla:xla_legalize_tf", + "//tensorflow/compiler/mlir/xla:xla_legalize_tf_with_tf2xla", "//tensorflow/compiler/xla:status", "//tensorflow/compiler/xla:statusor", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core/kernels:tensor_list", - "//tensorflow/core/platform:logging", "@com_google_absl//absl/container:inlined_vector", "@com_google_absl//absl/memory", "@llvm-project//llvm:Support", @@ -399,7 +400,6 @@ cc_library( "//tensorflow/compiler/mlir/lite/quantization:quantization_lib", "//tensorflow/compiler/mlir/tensorflow", "@llvm-project//llvm:Support", - "@llvm-project//mlir:Analysis", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@llvm-project//mlir:StandardOps", @@ -433,7 +433,6 @@ cc_library( "//tensorflow/core:protos_all_cc", "@com_google_absl//absl/memory", "@llvm-project//llvm:Support", - "@llvm-project//mlir:Analysis", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@llvm-project//mlir:QuantOps", @@ -454,7 +453,7 @@ cc_library( deps = [ ":tensorflow_lite", "//tensorflow/lite/tools/optimize/sparsity:format_converter", - "@com_google_absl//absl/base", + "//third_party/eigen3", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", @@ -480,7 +479,6 @@ gentbl( td_srcs = [ "@llvm-project//mlir:include/mlir/Interfaces/LoopLikeInterface.td", "//tensorflow/compiler/mlir/lite/quantization:quantization_td_files", - "experimental/tfl_hardware_interfaces.td", "ir/tfl_op_interfaces.td", ], ) @@ -609,8 +607,6 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:convert_tensor", "//tensorflow/compiler/mlir/tensorflow:export_tf_dialect_op", - "//tensorflow/compiler/mlir/tensorflow:mangling_util", - "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", "//tensorflow/compiler/mlir/tensorflow:tensorflow_dialect_registration", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", "//tensorflow/compiler/xla:statusor", @@ -620,7 +616,7 @@ cc_library( "//tensorflow/core/platform:status", "//tensorflow/lite:schema_fbs_version", "//tensorflow/lite:string_util", - "//tensorflow/lite/delegates/flex:whitelisted_flex_ops_lib", + "//tensorflow/lite/delegates/flex:allowlisted_flex_ops_lib", "//tensorflow/lite/kernels/internal:kernel_utils", "//tensorflow/lite/schema:schema_fbs", "//tensorflow/lite/tools/versioning", @@ -651,7 +647,6 @@ cc_library( ":flatbuffer_tflite_operator_lib", ":tensorflow_lite", ":tensorflow_lite_dialect_registration", - "//tensorflow/compiler/mlir/lite/quantization:quantization_lib", "//tensorflow/compiler/mlir/tensorflow:mangling_util", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", "//tensorflow/compiler/xla:statusor", @@ -724,7 +719,6 @@ cc_library( "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", - "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:MlirTranslateMain", @@ -858,10 +852,8 @@ cc_library( "//tensorflow/core:core_cpu_base", "@llvm-project//llvm:Support", "@llvm-project//mlir:AllPassesAndDialects", - "@llvm-project//mlir:Analysis", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", - "@llvm-project//mlir:QuantOps", "@llvm-project//mlir:Transforms", ], ) diff --git a/tensorflow/compiler/mlir/lite/experimental/estimators/BUILD 
b/tensorflow/compiler/mlir/lite/experimental/estimators/BUILD index 04d5d3db918..373c95f6bf5 100644 --- a/tensorflow/compiler/mlir/lite/experimental/estimators/BUILD +++ b/tensorflow/compiler/mlir/lite/experimental/estimators/BUILD @@ -8,9 +8,6 @@ package( cc_library( name = "cost_estimators", textual_hdrs = [ - "estimator.h", - "cpu_estimators.h", - "gpu_estimators.h", "hardware.h", "arithmetic_count_util.h", ], diff --git a/tensorflow/compiler/mlir/lite/experimental/estimators/arithmetic_count_util.h b/tensorflow/compiler/mlir/lite/experimental/estimators/arithmetic_count_util.h index 2ca49e4e1e5..782714f5955 100644 --- a/tensorflow/compiler/mlir/lite/experimental/estimators/arithmetic_count_util.h +++ b/tensorflow/compiler/mlir/lite/experimental/estimators/arithmetic_count_util.h @@ -15,13 +15,17 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_ESTIMATORS_ARITHMETIC_COUNT_UTIL_H_ #define TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_ESTIMATORS_ARITHMETIC_COUNT_UTIL_H_ +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project + // For add/mul/div/sub and other broadcastable ops. class ArithmeticCountUtilHelper { public: static bool GetArithmeticCountForBroadcastableOp(mlir::Operation* op, int64_t* count) { auto output = op->getResult(0); - auto output_type = output.getType().dyn_cast_or_null(); + auto output_type = + output.getType().dyn_cast_or_null(); if (!output_type || !output_type.hasStaticShape()) return false; *count = output_type.getNumElements(); @@ -31,7 +35,8 @@ class ArithmeticCountUtilHelper { static bool GetInputTensorTotalSize(mlir::Operation* op, int64_t* count) { int64_t total_count = 0; for (auto input : op->getOperands()) { - auto input_type = input.getType().dyn_cast_or_null(); + auto input_type = + input.getType().dyn_cast_or_null(); if (!input_type || !input_type.hasStaticShape()) { return false; } @@ -43,14 +48,16 @@ class ArithmeticCountUtilHelper { // For conv2d/depthwise_conv/fully_connected ops. // This algorithm actually comes from TOCO tooling_util.cc - static bool GetArithmeticCountForConvAndFullyconnectedOp(Operation* op, + static bool GetArithmeticCountForConvAndFullyconnectedOp(mlir::Operation* op, int64_t* count) { auto weight = op->getOperand(1); - auto weight_type = weight.getType().dyn_cast_or_null(); + auto weight_type = + weight.getType().dyn_cast_or_null(); if (weight_type == nullptr || !weight_type.hasStaticShape()) return false; auto output = op->getResult(0); - auto output_type = output.getType().dyn_cast_or_null(); + auto output_type = + output.getType().dyn_cast_or_null(); if (output_type == nullptr || !output_type.hasStaticShape()) return false; int64_t cols = 1; @@ -63,7 +70,8 @@ class ArithmeticCountUtilHelper { auto bias = op->getOperand(2); if (bias) { - auto bias_type = bias.getType().dyn_cast_or_null(); + auto bias_type = + bias.getType().dyn_cast_or_null(); if (bias_type && bias_type.hasStaticShape()) { *count += bias_type.getNumElements(); } diff --git a/tensorflow/compiler/mlir/lite/experimental/estimators/cpu_estimators.h b/tensorflow/compiler/mlir/lite/experimental/estimators/cpu_estimators.h deleted file mode 100644 index b47c08c7cb4..00000000000 --- a/tensorflow/compiler/mlir/lite/experimental/estimators/cpu_estimators.h +++ /dev/null @@ -1,149 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_ESTIMATORS_CPU_ESTIMATORS_H_ -#define TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_ESTIMATORS_CPU_ESTIMATORS_H_ - -// CPU -constexpr float kCPUArithmeticUnitCost = 1.0; - -// This basically assumes pure load/store. This is just fake data. -constexpr float kCPUCopyUnitCost = 0.5; -constexpr float kCPUDefaultCost = 3.0f; - -// Default values. -constexpr float kCPUDefaultFixedValuedCost = 10000.0; - -// tfl.add -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - int64_t count; - if (ArithmeticCountUtilHelper::GetArithmeticCountForBroadcastableOp(op, - &count)) - return kCPUArithmeticUnitCost * count; - return kCPUDefaultFixedValuedCost; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.concatenation -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - int64_t count; - if (ArithmeticCountUtilHelper::GetInputTensorTotalSize(op, &count)) - return kCPUCopyUnitCost * count; - return kCPUDefaultFixedValuedCost; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.conv_2d -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - int64_t arithmetic_count; - if (ArithmeticCountUtilHelper::GetArithmeticCountForConvAndFullyconnectedOp( - op, &arithmetic_count)) { - return arithmetic_count * kCPUArithmeticUnitCost; - } - return kCPUDefaultFixedValuedCost; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.depthwise_conv_2d -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - int64_t arithmetic_count; - if (ArithmeticCountUtilHelper::GetArithmeticCountForConvAndFullyconnectedOp( - op, &arithmetic_count)) { - return arithmetic_count * kCPUArithmeticUnitCost; - } - return kCPUDefaultFixedValuedCost; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.fully_connected -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - int64_t arithmetic_count; - if (ArithmeticCountUtilHelper::GetArithmeticCountForConvAndFullyconnectedOp( - op, &arithmetic_count)) { - return arithmetic_count * kCPUArithmeticUnitCost; - } - return kCPUDefaultFixedValuedCost; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.mul -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - int64_t count; - if (ArithmeticCountUtilHelper::GetArithmeticCountForBroadcastableOp(op, - &count)) - return kCPUArithmeticUnitCost * count; - return kCPUDefaultFixedValuedCost; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.pack -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - int64_t 
count; - if (ArithmeticCountUtilHelper::GetInputTensorTotalSize(op, &count)) - return kCPUCopyUnitCost * count; - return kCPUDefaultFixedValuedCost; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.reshape -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - int64_t count; - if (ArithmeticCountUtilHelper::GetInputTensorTotalSize(op, &count)) - return kCPUCopyUnitCost * count; - return kCPUDefaultFixedValuedCost; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -#endif // TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_ESTIMATORS_CPU_ESTIMATORS_H_ diff --git a/tensorflow/compiler/mlir/lite/experimental/estimators/estimator.h b/tensorflow/compiler/mlir/lite/experimental/estimators/estimator.h deleted file mode 100644 index c4a509945fa..00000000000 --- a/tensorflow/compiler/mlir/lite/experimental/estimators/estimator.h +++ /dev/null @@ -1,51 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_ESTIMATORS_ESTIMATOR_H_ -#define TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_ESTIMATORS_ESTIMATOR_H_ - -#include "llvm/Support/raw_ostream.h" -#include "mlir/IR/Operation.h" // from @llvm-project -#include "tensorflow/compiler/mlir/lite/experimental/estimators/hardware.h" -#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h.inc" - -template -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; - } - - static bool IsSupported(mlir::Operation* op) { - llvm::errs() << "No defined support for op: " - << op->getName().getStringRef().str(); - return false; - } -}; - -// All ops on CPU are supported. -// TODO(karimnosseir): Only allow TFL ops in the "TFL_OP" param. -template -class TFLiteCostEstimator { - public: - // TODO(karimnosseir): Update and use table based method and lookup - // cost from a loadable table ? - static double GetCost(mlir::Operation* op) { return 0.0; } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -#endif // TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_ESTIMATORS_ESTIMATOR_H_ diff --git a/tensorflow/compiler/mlir/lite/experimental/estimators/gpu_estimators.h b/tensorflow/compiler/mlir/lite/experimental/estimators/gpu_estimators.h deleted file mode 100644 index 45e8707ef44..00000000000 --- a/tensorflow/compiler/mlir/lite/experimental/estimators/gpu_estimators.h +++ /dev/null @@ -1,543 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_ESTIMATORS_GPU_ESTIMATORS_H_ -#define TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_ESTIMATORS_GPU_ESTIMATORS_H_ - -// GPU -constexpr float kGPUArithmeticUnitCost = 0.2; - -// The copy can be non-consectutive copy. This is just fake data. -constexpr float kGPUCopyUnitCost = 0.2; -constexpr float kGPUDefaultCost = 1.0f; - -// Default values. -constexpr float kGPUDefaultFixedValuedCost = 10000.0; - -// tfl.abs -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.add -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - int64_t count; - if (ArithmeticCountUtilHelper::GetArithmeticCountForBroadcastableOp(op, - &count)) - return kGPUArithmeticUnitCost * count; - return kGPUDefaultFixedValuedCost; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.average_pool_2d -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.concatenation -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - int64_t count; - if (ArithmeticCountUtilHelper::GetInputTensorTotalSize(op, &count)) - return kGPUCopyUnitCost * count; - return kGPUDefaultFixedValuedCost; - } - - // TODO(renjieliu): We probably need to check for dynamic weights. - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.conv_2d -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - int64_t arithmetic_count; - if (ArithmeticCountUtilHelper::GetArithmeticCountForConvAndFullyconnectedOp( - op, &arithmetic_count)) { - return arithmetic_count * kGPUArithmeticUnitCost; - } - return kGPUDefaultFixedValuedCost; - } - - // TODO(renjieliu): We probably need to check for dynamic weights. 
- static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.cos -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.depthwise_conv_2d -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - int64_t arithmetic_count; - if (ArithmeticCountUtilHelper::GetArithmeticCountForConvAndFullyconnectedOp( - op, &arithmetic_count)) { - return arithmetic_count * kGPUArithmeticUnitCost; - } - return kGPUDefaultFixedValuedCost; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.div -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.exp -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.fully_connected -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - int64_t arithmetic_count; - if (ArithmeticCountUtilHelper::GetArithmeticCountForConvAndFullyconnectedOp( - op, &arithmetic_count)) { - return arithmetic_count * kGPUArithmeticUnitCost; - } - return kGPUDefaultFixedValuedCost; - } - - // TODO(renjieliu): we need to check for dynamic weights. 
- static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.hard_swish -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.log -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.logistic -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.max_pool_2d -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.mirror_pad -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.maximum -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.custom -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.mean -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; - } - - // TODO(renjieiu): check for constraints. 
- static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.minimum -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.mul -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - int64_t count; - if (ArithmeticCountUtilHelper::GetArithmeticCountForBroadcastableOp(op, - &count)) - return kGPUArithmeticUnitCost * count; - return kGPUDefaultFixedValuedCost; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.pad -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.pow -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.prelu -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.relu -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.relu6 -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.reshape -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - int64_t count; - if (ArithmeticCountUtilHelper::GetInputTensorTotalSize(op, &count)) - return kGPUCopyUnitCost * count; - return kGPUDefaultFixedValuedCost; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.rsqrt -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.sin -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.slice -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.softmax -template <> -class TFLiteCostEstimator { - public: - static double 
GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.space_to_depth -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.sqrt -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.square -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.squared_difference -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.strided_slice -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.tanh -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.transpose -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -// tfl.transpose_conv -template <> -class TFLiteCostEstimator { - public: - static double GetCost(mlir::Operation* op) { - llvm::errs() << "No defined cost function for op: " - << op->getName().getStringRef().str(); - return 0.0; - } - - static bool IsSupported(mlir::Operation* op) { return true; } -}; - -#endif // TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_ESTIMATORS_GPU_ESTIMATORS_H_ - diff --git a/tensorflow/compiler/mlir/lite/experimental/tfl_hardware_interfaces.td b/tensorflow/compiler/mlir/lite/experimental/tfl_hardware_interfaces.td deleted file mode 100644 index 5c3ec6c206c..00000000000 --- a/tensorflow/compiler/mlir/lite/experimental/tfl_hardware_interfaces.td +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// WARNING: This Interface is experimental, DO NOT USE. - -// This is the Target Hardware operation interfacea definition file -// for TensorFlow Lite. - -#ifndef TFL_TARGET_HARDWARE_OP_INTERFACES -#define TFL_TARGET_HARDWARE_OP_INTERFACES - -def TFL_CpuTargetOp : OpInterface<"CpuOpTargetInterface"> { - let description = [{ - Interface for ops to run on CPU. - }]; - - let methods = [ - InterfaceMethod< - [{Returns the cost of running this op on CPU.}], - // TODO(karimnosseir): Change to return Cost object instead. - "double", "GetOpCost", (ins "mlir::Operation*":$op_to_check), [{ - // TODO(karimnosseir): Consider changing to another way that doesn't - // rely on template param name. - return TFL::TFLiteCostEstimator::GetCost(op_to_check); - }] - >, - InterfaceMethod< - [{Returns whether this op can be run on CPU.}], - "bool", "IsSupported", (ins "mlir::Operation*":$op_to_check), [{ - // TODO(karimnosseir): Consider changing to another way that doesn't - // rely on template param name. - return TFL::TFLiteCostEstimator::IsSupported(op_to_check); - }] - >, - ]; -} - -def TFL_GpuTargetOp : OpInterface<"GpuOpTargetInterface"> { - let description = [{ - Interface for ops to run on GPU. - }]; - - let methods = [ - InterfaceMethod< - [{Returns the cost of running this op on GPU.}], - // TODO(karimnosseir): Change to return Cost object instead. - "double", "GetOpCost", (ins "Operation*":$op_to_check), [{ - // TODO(karimnosseir): Consider changing to another way that doesn't - // rely on template param name. - return TFL::TFLiteCostEstimator::GetCost(op_to_check); - }] - >, - InterfaceMethod< - [{Returns whether this op can be run on GPU.}], - "bool", "IsSupported", (ins "Operation*":$op_to_check), [{ - // TODO(karimnosseir): Consider changing to another way that doesn't - // rely on template param name. - return TFL::TFLiteCostEstimator::IsSupported(op_to_check); - }] - >, - ]; -} - -#endif // TFL_TARGET_HARDWARE_OP_INTERFACES diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc index fb20e842a75..89fae87cb25 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc @@ -149,6 +149,9 @@ static StatusOr GetTFLiteType(Type type, if (ftype && ftype.isF32()) { return tflite::TensorType_COMPLEX64; } + if (ftype && ftype.isF64()) { + return tflite::TensorType_COMPLEX128; + } return Status(error::INVALID_ARGUMENT, "Unsupported type"); } case mlir::StandardTypes::Integer: { @@ -1193,22 +1196,35 @@ Optional> Translator::BuildSubGraph( if (IsConst(&inst)) continue; // Fetch operand and result tensor indices. - std::vector operands; - operands.reserve(inst.getNumOperands()); - for (auto operand : inst.getOperands()) { - if (operand.getType().isa()) - operands.push_back(kTfLiteOptionalTensor); - else - operands.push_back(tensor_index_map.lookup(operand)); - } std::vector results; results.reserve(inst.getNumOperands()); for (auto result : inst.getResults()) { results.push_back(tensor_index_map.lookup(result)); } + Operation* real_inst = &inst; + // CustomTfOp is just a wrapper around a TF op, we export the custom Op + // not the wrapper, so we fetch the op from the region. 
+ if (auto custom_op = dyn_cast(inst)) { + // If we have custom op with a region, then use the first op in the + // region, if it exists, otherwise just use params for custom op. + if (!custom_op.body().empty()) { + real_inst = &custom_op.body().front().front(); + } else { + module_.emitError( + "Invalid CustomTfOp: Custom TF Op have empty region."); + } + } + std::vector operands; + operands.reserve(real_inst->getNumOperands()); + for (auto operand : real_inst->getOperands()) { + if (operand.getType().isa()) + operands.push_back(kTfLiteOptionalTensor); + else + operands.push_back(tensor_index_map.lookup(operand)); + } if (auto tfl_operator = - BuildOperator(&inst, operands, results, intermediates)) + BuildOperator(real_inst, operands, results, intermediates)) operators.push_back(*tfl_operator); else failed_once = true; @@ -1402,7 +1418,7 @@ BufferOffset Translator::BuildSparsityParameters( } else { auto segments = dim_metadata.segments(); std::vector vector_segments(segments.size(), 0); - for (int j = 0; j < segments.size(); j++) { + for (int j = 0, end = segments.size(); j < end; j++) { vector_segments[j] = segments[j].dyn_cast().getInt(); } tflite::SparseIndexVector segments_type; @@ -1434,7 +1450,7 @@ BufferOffset Translator::BuildSparsityParameters( auto indices = dim_metadata.indices(); std::vector vector_indices(indices.size(), 0); int max_of_indices = 0; - for (int j = 0; j < indices.size(); j++) { + for (int j = 0, end = indices.size(); j < end; j++) { vector_indices[j] = indices[j].dyn_cast().getInt(); if (vector_indices[j] > max_of_indices) { max_of_indices = vector_indices[j]; diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc index fa85b4e50fd..3c8bf26aa14 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc @@ -229,7 +229,7 @@ mlir::Operation* ConvertMinMaxToStatsOp(const TensorT& tensor, OpBuilder b, llvm::SmallVector min_maxs; min_maxs.reserve(mins.size() * 2); - for (int i = 0; i < mins.size(); ++i) { + for (int i = 0, end = mins.size(); i < end; ++i) { llvm::APFloat min(mins[i]); llvm::APFloat max(maxs[i]); min_maxs.push_back(min); @@ -281,7 +281,7 @@ std::vector ReadAsLittleEndian(ArrayRef bytes) { int bytes_len = bytes.size(); assert(bytes_len % read_size == 0); - size_t elem_count = bytes_len / read_size; + int elem_count = bytes_len / read_size; ret.reserve(elem_count); const char* data_ptr = reinterpret_cast(bytes.data()); @@ -318,7 +318,7 @@ StatusOr ConvertFloatBuffer( switch (elem_type.getWidth()) { case 16: { assert(bytes_len % 2 == 0); - size_t elem_count = bytes_len / 2; + int elem_count = bytes_len / 2; std::vector values; values.reserve(elem_count); @@ -337,7 +337,7 @@ StatusOr ConvertFloatBuffer( } case 32: { assert(bytes_len % 4 == 0); - size_t elem_count = bytes_len / 4; + int elem_count = bytes_len / 4; std::vector values; values.reserve(elem_count); @@ -353,7 +353,7 @@ StatusOr ConvertFloatBuffer( } case 64: { assert(bytes_len % 8 == 0); - size_t elem_count = bytes_len / 8; + int elem_count = bytes_len / 8; std::vector values; values.reserve(elem_count); @@ -829,7 +829,7 @@ StatusOr ConvertSubgraph( // Add state variables to inputs. 
absl::flat_hash_set input_index_set(func_inputs.begin(), func_inputs.end()); - for (int i = 0; i < subgraph.tensors.size(); i++) { + for (int i = 0, end = subgraph.tensors.size(); i < end; i++) { auto& tensor = *subgraph.tensors.at(i); if (tensor.is_variable && !input_index_set.contains(i)) { func_inputs.emplace_back(i); diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_op_interfaces.td b/tensorflow/compiler/mlir/lite/ir/tfl_op_interfaces.td index becc2f7ab85..e14178d6f6d 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_op_interfaces.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_op_interfaces.td @@ -19,7 +19,6 @@ limitations under the License. #define TFL_OP_INTERFACES include "mlir/IR/OpBase.td" -include "tensorflow/compiler/mlir/lite/experimental/tfl_hardware_interfaces.td" //===----------------------------------------------------------------------===// // TFL op interface for stateful operands. diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc index 427b9c692a7..b5fcd5e82e2 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc @@ -269,7 +269,7 @@ struct TensorFlowLiteOpFolderDialectInterface }; TensorFlowLiteDialect::TensorFlowLiteDialect(mlir::MLIRContext *context) - : Dialect(/*name=*/"tfl", context) { + : Dialect(/*name=*/"tfl", context, TypeID::get()) { addOperations< #define GET_OP_LIST #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.cc.inc" @@ -773,8 +773,8 @@ static LogicalResult Verify(CustomOp op) { op.custom_option().cast(); if (!opaque_attr.getType().hasStaticShape()) return op.emitOpError("custom_option should have a static shape."); - if (opaque_attr.getValue().size() != - opaque_attr.getType().cast().getDimSize(0)) + const int attribute_size = opaque_attr.getValue().size(); + if (attribute_size != opaque_attr.getType().cast().getDimSize(0)) return op.emitOpError( "custom_option should have the same length of content with shape."); return success(); @@ -955,7 +955,7 @@ static LogicalResult Verify(ScatterNdOp op) { // Checks whether the last `(shape_type.getDimSize(0) - outermost_dim)` // dimensions of `updates` and `shape` are equal. 
for (auto shape_it : llvm::enumerate(shape_value)) { - auto i = shape_it.index(); + int64_t i = shape_it.index(); auto value = shape_it.value().getSExtValue(); if (i >= outermost_dim) { auto corresponding_dim = i - outermost_dim + outer_dims; @@ -1192,7 +1192,8 @@ struct RemoveRedundantUnpackPack : public RewritePattern { return failure(); const int total_pack_inputs = pack_op.getNumOperands(); - if (total_pack_inputs != input_unpack_op.getNumResults()) return failure(); + const int num_results = input_unpack_op.getNumResults(); + if (total_pack_inputs != num_results) return failure(); for (auto input_output : llvm::zip(pack_op.getOperands(), input_unpack_op.getResults())) { Value pack_input = std::get<0>(input_output); @@ -1261,8 +1262,7 @@ static LogicalResult Verify(SliceOp op) { } if (begin && size && input_type.hasStaticShape()) { - const int input_rank = begin.getNumElements(); - for (uint64_t i = 0; i < input_rank; i++) { + for (uint64_t i = 0, end = begin.getNumElements(); i < end; i++) { int begin_i = begin.getValue({i}).cast().getValue().getSExtValue(); int size_i = diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.h b/tensorflow/compiler/mlir/lite/ir/tfl_ops.h index c7a1504c3b7..caed0bb3ad9 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.h +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.h @@ -29,7 +29,7 @@ limitations under the License. #include "mlir/Interfaces/LoopLikeInterface.h" // from @llvm-project #include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project -#include "tensorflow/compiler/mlir/lite/quantization/quantization_traits.h" +#include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" #include "tensorflow/lite/schema/schema_generated.h" namespace mlir { @@ -48,14 +48,9 @@ class TensorFlowLiteDialect : public Dialect { Location loc) override; }; -#include "tensorflow/compiler/mlir/lite/experimental/estimators/estimator.h" #include "tensorflow/compiler/mlir/lite/ir/tfl_ops_interface.h.inc" #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h.inc" -// Include all specializes estimators below this line -#include "tensorflow/compiler/mlir/lite/experimental/estimators/arithmetic_count_util.h" -#include "tensorflow/compiler/mlir/lite/experimental/estimators/cpu_estimators.h" -#include "tensorflow/compiler/mlir/lite/experimental/estimators/gpu_estimators.h" } // end namespace TFL } // end namespace mlir diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index 4a56d893b19..6dc9fda656f 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -410,10 +410,7 @@ def TFL_ComparisonBinaryBuilder : OpBuilder< class TFL_Op traits = []> : Op, - // All TFL ops are supported on CPU. - DeclareOpInterfaceMethods - ])> { + [DeclareOpInterfaceMethods])> { // FlatBuffer generation specific information. 
// ------------------------------------------- // When generating the FlatBuffer output some operations have @@ -435,8 +432,7 @@ class TFL_Op traits = []> : class TFL_ConvOp : TFL_Op, - AffineQuantizedOpInterface, AffineOpCoefficient, - TFL_GpuTargetOp, TFL_SparseOp]> { + AffineQuantizedOpInterface, AffineOpCoefficient, TFL_SparseOp]> { let summary = opSummary # " operator"; let description = [{ @@ -473,8 +469,7 @@ def TFL_AbsOp : TFL_Op<"abs", [ NoSideEffect, SameOperandsAndResultShape, SameOperandsAndResultType, - NoQuantizableResult, - TFL_GpuTargetOp]> { + NoQuantizableResult]> { let summary = "Absolute value operator"; let description = [{ @@ -495,8 +490,7 @@ def TFL_AddOp : TFL_Op<"add", [ CPred<"TFL::VerifyAddOpShapeConstraints(llvm::cast($_op))">>, ResultsBroadcastableShape, NoSideEffect, - Commutative, - TFL_GpuTargetOp]> { + Commutative]> { let summary = "Addition operator"; let description = [{ @@ -573,7 +567,6 @@ def TFL_TransposeConvOp: TFL_Op<"transpose_conv", [ TFL_TCresVTEtIsSameAsOp<0, 2>>, AccumulatorUniformScale<3, 1, 2>, AffineQuantizedOpInterface, AffineOpCoefficient<0, 2>, - TFL_GpuTargetOp, TFL_SparseOp]> { let summary = "Transpose convolution operator"; @@ -612,8 +605,7 @@ def TFL_TransposeConvOp: TFL_Op<"transpose_conv", [ def TFL_AveragePool2DOp: TFL_Op<"average_pool_2d", [NoSideEffect, - SameOperandsAndResultsScale, - TFL_GpuTargetOp]> { + SameOperandsAndResultsScale]> { let summary = "Average_pool_2d operator"; let description = [{ @@ -713,8 +705,7 @@ def TFL_ConcatenationOp : TFL_Op<"concatenation", NoSideEffect, PredOpTrait<"values and output must have same element type", TFL_TCresVTEtIsSameAsOp<0, 0>>, - SameOperandsAndResultsScale, - TFL_GpuTargetOp + SameOperandsAndResultsScale ]> { let summary = "Concatenation operator"; @@ -861,8 +852,7 @@ def TFL_CosOp: TFL_Op<"cos", [ NoSideEffect, SameOperandsAndResultShape, SameOperandsAndResultType, - NoQuantizableResult, - TFL_GpuTargetOp]> { + NoQuantizableResult]> { let summary = "Cosine operator"; let description = [{ @@ -916,8 +906,7 @@ def TFL_FullyConnectedOp : TFL_Op<"fully_connected", [ NoSideEffect, AccumulatorUniformScale<2, 0, 1>, AffineQuantizedOpInterface, AffineOpCoefficient<-1, 1>, - TFL_SparseOp, - TFL_GpuTargetOp]> { + TFL_SparseOp]> { let summary = "Fully connected op"; let arguments = (ins @@ -954,7 +943,10 @@ def TFL_BatchMatMulOp : TFL_Op<"batch_matmul", [ NoSideEffect, TFL_OperandHasAtleastRank<0, 2>, TFL_OperandHasAtleastRank<1, 2>, - SameOperandsAndResultElementType]> { + PredOpTrait<"x and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + PredOpTrait<"y and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 1>>]> { let summary = "Batch Matrix Multiply Operator"; @@ -1070,8 +1062,7 @@ def TFL_LessEqualOp : TFL_Op<"less_equal", [ ResultsBroadcastableShape, BinaryOpSameElementTypeConstraint, TFL_OperandsHaveSameShapesOrBroadcastableShape<[0, 1], 4>, - NoSideEffect, - NoQuantizableResult]> { + NoSideEffect]> { let summary = "Less_equal operator"; let description = [{ @@ -1132,8 +1123,7 @@ convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imag def TFL_GreaterEqualOp : TFL_Op<"greater_equal", [ TFL_OperandsHaveSameShapesOrBroadcastableShape<[0, 1], 4>, ResultsBroadcastableShape, - NoSideEffect, - NoQuantizableResult]> { + NoSideEffect]> { let summary = "Greater_equal operator"; let description = [{ @@ -1360,8 +1350,7 @@ def TFL_DivOp : TFL_Op<"div", [ TFL_OperandsHaveSameShapesOrBroadcastableShape<[0, 1], 5>, 
ResultsBroadcastableShape, NoSideEffect, - NoQuantizableResult, - TFL_GpuTargetOp]> { + NoQuantizableResult]> { let summary = "Division operator"; let description = [{ @@ -1427,7 +1416,6 @@ def TFL_EmbeddingLookupOp: TFL_Op<"embedding_lookup", def TFL_EqualOp: TFL_Op<"equal", [ Commutative, - NoQuantizableResult, ResultsBroadcastableShape, TFL_OperandsHaveSameShapesOrBroadcastableShape<[0, 1], 4>, PredOpTrait<"Operands have same value type", TCopVTEtIsSameAs<0, 1>>]> { @@ -1449,8 +1437,7 @@ def TFL_EqualOp: TFL_Op<"equal", [ } def TFL_ExpOp: TFL_Op<"exp", [NoSideEffect, - SameOperandsAndResultType, - TFL_GpuTargetOp]> { + SameOperandsAndResultType]> { let summary = "Natural exponentiation operator"; let description = [{ @@ -1634,8 +1621,7 @@ def TFL_GreaterOp : TFL_Op<"greater", [ ResultsBroadcastableShape, BinaryOpSameElementTypeConstraint, TFL_OperandsHaveSameShapesOrBroadcastableShape<[0, 1], 4>, - NoSideEffect, - NoQuantizableResult]> { + NoSideEffect]> { let summary = "Greater operator"; let description = [{ @@ -1659,8 +1645,7 @@ def TFL_HardSwishOp: TFL_Op<"hard_swish", [ NoSideEffect, SameOperandsAndResultShape, PredOpTrait<"input and output must have same element type", - TFL_TCresVTEtIsSameAsOp<0, 0>>, - TFL_GpuTargetOp]> { + TFL_TCresVTEtIsSameAsOp<0, 0>>]> { let summary = "Hardswish activation function."; let description = [{ Computes hard-swish activation function @@ -1676,12 +1661,7 @@ def TFL_HardSwishOp: TFL_Op<"hard_swish", [ } def TFL_L2NormalizationOp : TFL_Op<"l2_normalization", [NoSideEffect, - FixedOutputRangeInterface, - // central_value = min_value / 2 + (max_value - 1) / 2 + 1 - // zero_point = central_value - // scale = 1. / (central_value - min_value) - FixedResultScale>, - FixedResultScale>]> { + FixedOutputRangeInterface]> { let summary = "L2 Normalize Operator"; let description = [{ @@ -1703,29 +1683,12 @@ def TFL_L2NormalizationOp : TFL_Op<"l2_normalization", [NoSideEffect, // FixedOutputRangeInterface: quant::UniformQuantizedType GetFixedOutputRange( bool is_signed, int bit_width) { - auto result_type = output().getType().cast(); - if (!result_type.getElementType().isa()) return {}; - Builder builder(result_type.getContext()); - - // Only support 8-bits - if (bit_width != 8) return {}; - IntegerType storage_type = builder.getIntegerType(bit_width); - - double scale = 1.0 / 128; - int64_t zero_point, storage_min, storage_max; - if (is_signed) { - zero_point = 0; - storage_min = -128; - storage_max = 127; - } else { - zero_point = 128; - storage_min = 0; - storage_max = 255; - } - - return quant::UniformQuantizedType::getChecked( - is_signed, storage_type, result_type.getElementType(), scale, - zero_point, storage_min, storage_max, builder.getUnknownLoc()); + auto result_type = output().getType(); + // central_value = min_value / 2 + (max_value - 1) / 2 + 1 + // zero_point = central_value + // scale = 1. / (central_value - min_value) + return quant::GetFixedOutputRange(is_signed, bit_width, result_type, + /*scale=*/1.0 / 128, /*zero_point=*/0); } }]; } @@ -1757,8 +1720,7 @@ def TFL_LessOp : TFL_Op<"less", [ ResultsBroadcastableShape, BinaryOpSameElementTypeConstraint, TFL_OperandsHaveSameShapesOrBroadcastableShape<[0, 1], 4>, - NoSideEffect, - NoQuantizableResult]> { + NoSideEffect]> { let summary = "Less operator"; let description = [{ @@ -1834,12 +1796,7 @@ def TFL_LogisticOp: TFL_Op<"logistic", [ PredOpTrait<"x and y must have same element type", TFL_TCresVTEtIsSameAsOp<0, 0>>, SameOperandsAndResultShape, - // zero_point = 0 - // scale = 1. 
/ (max_value + 1) - FixedResultScale>, - FixedResultScale>, - FixedOutputRangeInterface, - TFL_GpuTargetOp]> { + FixedOutputRangeInterface]> { let summary = "Logistic operator"; let description = [{ @@ -1854,29 +1811,11 @@ def TFL_LogisticOp: TFL_Op<"logistic", [ // FixedOutputRangeInterface: quant::UniformQuantizedType GetFixedOutputRange( bool is_signed, int bit_width) { - auto result_type = y().getType().cast(); - if (!result_type.getElementType().isa()) return {}; - Builder builder(result_type.getContext()); - - // Only support 8-bits - if (bit_width != 8) return {}; - IntegerType storage_type = builder.getIntegerType(bit_width); - - double scale = 1.0 / 256; - int64_t zero_point, storage_min, storage_max; - if (is_signed) { - zero_point = -128; - storage_min = -128; - storage_max = 127; - } else { - zero_point = 0; - storage_min = 0; - storage_max = 255; - } - - return quant::UniformQuantizedType::getChecked( - is_signed, storage_type, result_type.getElementType(), scale, - zero_point, storage_min, storage_max, builder.getUnknownLoc()); + auto result_type = y().getType(); + // zero_point = 0 + // scale = 1. / (max_value + 1) + return quant::GetFixedOutputRange(is_signed, bit_width, result_type, + /*scale=*/1.0 / 256, /*zero_point=*/-128); } }]; } @@ -1885,8 +1824,7 @@ def TFL_LogOp: TFL_Op<"log", [ NoSideEffect, SameOperandsAndResultShape, SameOperandsAndResultType, - NoQuantizableResult, - TFL_GpuTargetOp]> { + NoQuantizableResult]> { let summary = "Natural logarithm operator"; let description = [{ @@ -1905,10 +1843,7 @@ def TFL_LogSoftmaxOp : TFL_Op<"log_softmax", [ SameOperandsAndResultShape, PredOpTrait<"x and y must have same element type", TFL_TCresVTEtIsSameAsOp<0, 0>>, - // zero_point = max_value - // scale = -log_softmax_output_min / (max_value + 1) - FixedResultScale>, - FixedResultScale>]> { + FixedOutputRangeInterface]> { let summary = "Log softmax operator"; let description = [{ @@ -1922,6 +1857,18 @@ def TFL_LogSoftmaxOp : TFL_Op<"log_softmax", [ let results = (outs TFL_TensorOf<[F32, QUI8, QI8, TFL_Quint8]>:$output); let hasOptions = 1; + + let extraClassDeclaration = [{ + // FixedOutputRangeInterface: + quant::UniformQuantizedType GetFixedOutputRange( + bool is_signed, int bit_width) { + auto result_type = output().getType(); + // zero_point = max_value + // scale = -log_softmax_output_min / (max_value + 1) + return quant::GetFixedOutputRange(is_signed, bit_width, result_type, + /*scale=*/16.0 / 256, /*zero_point=*/127); + } + }]; } // TODO(ashwinm): Revisit the granularity of the PredOpTraits. We could @@ -1943,8 +1890,7 @@ def TFL_MaxPool2DOp : TFL_Op<"max_pool_2d", [ TFL_TCresVTEtIsSameAsOp<0, 0>>, NoSideEffect, MaxPoolOperandAndResultConstraints, - SameOperandsAndResultsScale, - TFL_GpuTargetOp]> { + SameOperandsAndResultsScale]> { let summary = "Max Pool 2D op"; let description = [{ @@ -1976,8 +1922,7 @@ def TFL_MaximumOp : TFL_Op<"maximum", [ NoSideEffect, TFL_OperandsHaveSameShapesOrBroadcastableShape<[0, 1], 5>, Commutative, - SameOperandsAndResultsScale, - TFL_GpuTargetOp]> { + SameOperandsAndResultsScale]> { let summary = "Max operator"; let description = [{ Element-wise max operation. 
@@ -2000,8 +1945,7 @@ def TFL_MaximumOp : TFL_Op<"maximum", [ def TFL_MeanOp : TFL_Op<"mean", [ PredOpTrait<"input and output must have same element type", TFL_TCresVTEtIsSameAsOp<0, 0>>, - NoSideEffect, - TFL_GpuTargetOp]> { + NoSideEffect]> { let summary = "Mean operator"; let description = [{ @@ -2079,8 +2023,7 @@ def TFL_SliceOp : TFL_Op<"slice", [ SameOperandsAndResultsScale, TFL_OperandHasRankAtMost<0, 4>, TFL_OperandHasRankAtMost<1, 1>, - TFL_OperandHasRankAtMost<2, 1>, - TFL_GpuTargetOp]> { + TFL_OperandHasRankAtMost<2, 1>]> { let summary = "Return a slice from 'input'."; let description = [{ @@ -2211,8 +2154,7 @@ def TFL_MinimumOp : TFL_Op<"minimum", [ NoSideEffect, TFL_OperandsHaveSameShapesOrBroadcastableShape<[0, 1], 5>, Commutative, - SameOperandsAndResultsScale, - TFL_GpuTargetOp]> { + SameOperandsAndResultsScale]> { let summary = "Min operator"; let description = [{ Element-wise min operation. @@ -2238,8 +2180,7 @@ def TFL_MulOp : TFL_Op<"mul", [ Commutative, BinaryOpSameElementTypeConstraint, TFL_RuntimePredOpTrait<"Operands do not have valid shapes", - CPred<"TFL::VerifyMulOpShapeConstraints(llvm::cast($_op))">>, - TFL_GpuTargetOp]> { + CPred<"TFL::VerifyMulOpShapeConstraints(llvm::cast($_op))">>]> { let summary = "Multiplication operator"; let description = [{ @@ -2345,8 +2286,7 @@ def TFL_PadOp : TFL_Op<"pad", [ TFL_OperandRankEquals1DimOfOperand<0, 1>, PredOpTrait<"the first dim size of the padding argument must be at most 4", Or<[TFL_OperandIsUnrankedPred<1>, - TFL_OperandDimIsAtMost<1, 0, 4>]>>, - TFL_GpuTargetOp]> { + TFL_OperandDimIsAtMost<1, 0, 4>]>>]> { let summary = "Padding operator"; let description = [{ @@ -2439,8 +2379,7 @@ def TFL_PowOp : TFL_Op<"pow", [ ResultsBroadcastableShape, NoSideEffect, TFL_OperandsHaveSameShapesOrBroadcastableShape<[0, 1], 4>, - NoQuantizableResult, - TFL_GpuTargetOp]> { + NoQuantizableResult]> { let summary = "Power operator"; let description = [{ @@ -2463,7 +2402,6 @@ def TFL_PowOp : TFL_Op<"pow", [ def TFL_PReluOp : TFL_Op<"prelu", [ NoSideEffect, ResultsBroadcastableShape, - TFL_GpuTargetOp, TFL_OperandsHaveSameShapesOrBroadcastableShape<[0, 1], 4>, BinaryOpSameElementTypeConstraint, PredOpTrait<"input and output must have the same element type", @@ -2505,8 +2443,7 @@ def TFL_ReluOp: TFL_Op<"relu", [ TFL_TCresVTEtIsSameAsOp<0, 0>>, NoSideEffect, SameOperandsAndResultShape, - SameOperandsAndResultsScale, - TFL_GpuTargetOp]> { + SameOperandsAndResultsScale]> { let summary = "Relu operator"; let description = [{ @@ -2535,8 +2472,7 @@ def TFL_Relu6Op: TFL_Op<"relu6", [ TFL_TCresVTEtIsSameAsOp<0, 0>>, NoSideEffect, SameOperandsAndResultShape, - SameOperandsAndResultsScale, - TFL_GpuTargetOp]> { + SameOperandsAndResultsScale]> { let summary = "Relu6 operator"; let description = [{ @@ -2590,7 +2526,7 @@ def TFL_Relu1Op: TFL_Op<"relu_n1_to_1", [ } def TFL_ReshapeOp: TFL_Op<"reshape", [ - NoSideEffect, SameOperandsAndResultsScale, TFL_GpuTargetOp]> { + NoSideEffect, SameOperandsAndResultsScale]> { let summary = "Reshape operator"; let description = [{ @@ -2645,8 +2581,7 @@ slice `i`, with the first `seq_lengths[i]` slices along dimension def TFL_RsqrtOp: TFL_Op<"rsqrt", [NoSideEffect, SameOperandsAndResultType, SameOperandsAndResultShape, - NoQuantizableResult, - TFL_GpuTargetOp]> { + NoQuantizableResult]> { let summary = "Reciprocal of square root operator"; let description = [{ @@ -2741,6 +2676,7 @@ def TFL_ReverseV2Op: TFL_Op<"reverse_v2", [ // are unranked. Therefore, we skip adding shape constraints here. 
def TFL_SelectOp : TFL_Op<"select", [ NoSideEffect, + SameOperandsAndResultsScale, PredOpTrait<"operands have same element type", TCopVTEtIsSameAs<1, 2>>, PredOpTrait<"operands and result have same element type", TFL_TCresVTEtIsSameAsOp<0, 1>>]> { @@ -2812,8 +2748,7 @@ def TFL_SinOp: TFL_Op<"sin", [ NoSideEffect, SameOperandsAndResultShape, SameOperandsAndResultType, - NoQuantizableResult, - TFL_GpuTargetOp]> { + NoQuantizableResult]> { let summary = "Sine operator"; let description = [{ @@ -2833,11 +2768,7 @@ def TFL_SoftmaxOp : TFL_Op<"softmax", [ TFL_TCresVTEtIsSameAsOp<0, 0>>, TFL_OperandHasRankRange<0, 1, 4>, SameOperandsAndResultShape, - // zero_point = 0 - // scale = 1. / (max_value + 1) - FixedResultScale>, - FixedResultScale>, - TFL_GpuTargetOp]> { + FixedOutputRangeInterface]> { let summary = "Softmax operator"; let description = [{ @@ -2854,14 +2785,25 @@ def TFL_SoftmaxOp : TFL_Op<"softmax", [ let results = (outs TFL_TensorOf<[F32, QI8, QUI8, TFL_Quint8]>:$output); let hasOptions = 1; + + let extraClassDeclaration = [{ + // FixedOutputRangeInterface: + quant::UniformQuantizedType GetFixedOutputRange( + bool is_signed, int bit_width) { + auto result_type = output().getType(); + // zero_point = 0 + // scale = 1. / (max_value + 1) + return quant::GetFixedOutputRange(is_signed, bit_width, result_type, + /*scale=*/1.0 / 256, /*zero_point=*/-128); + } + }]; } def TFL_SqrtOp: TFL_Op<"sqrt", [ NoSideEffect, SameOperandsAndResultShape, SameOperandsAndResultType, - NoQuantizableResult, - TFL_GpuTargetOp]> { + NoQuantizableResult]> { let summary = "Square root operator"; let description = [{ @@ -2879,8 +2821,7 @@ def TFL_SquareOp: TFL_Op<"square", [ NoSideEffect, SameOperandsAndResultShape, SameOperandsAndResultType, - NoQuantizableResult, - TFL_GpuTargetOp]> { + NoQuantizableResult]> { let summary = "Square operator"; let description = [{ @@ -2933,8 +2874,7 @@ def TFL_SquaredDifferenceOp : TFL_Op<"squared_difference", [ SameOperandsAndResultElementType, ResultsBroadcastableShape, NoSideEffect, - NoQuantizableResult, - TFL_GpuTargetOp]> { + NoQuantizableResult]> { let summary = "Squared difference operator"; let description = [{ @@ -2959,12 +2899,7 @@ def TFL_TanhOp: TFL_Op<"tanh", [ SameOperandsAndResultShape, PredOpTrait<"input and output must have same element type", TFL_TCresVTEtIsSameAsOp<0, 0>>, - // central_value = min_value / 2 + (max_value - 1) / 2 + 1 - // zero_point = central_value - // scale = 1. / (central_value - min_value) - FixedResultScale>, - FixedResultScale>, - TFL_GpuTargetOp]> { + FixedOutputRangeInterface]> { let summary = "Hyperbolic tangent operator"; let description = [{ @@ -2985,6 +2920,19 @@ def TFL_TanhOp: TFL_Op<"tanh", [ state.addTypes(input.getType()); }]> ]; + + let extraClassDeclaration = [{ + // FixedOutputRangeInterface: + quant::UniformQuantizedType GetFixedOutputRange( + bool is_signed, int bit_width) { + auto result_type = output().getType(); + // central_value = min_value / 2 + (max_value - 1) / 2 + 1 + // zero_point = central_value + // scale = 1. 
/ (central_value - min_value) + return quant::GetFixedOutputRange(is_signed, bit_width, result_type, + /*scale=*/1.0 / 128, /*zero_point=*/0); + } + }]; } def TFL_TileOp: TFL_Op<"tile", [ @@ -3052,8 +3000,7 @@ def TFL_TransposeOp : TFL_Op<"transpose", [ TFL_OperandHasRank<1, 1>, PredOpTrait<"input and output must have same element type", TFL_TCresVTEtIsSameAsOp<0, 0>>, - SameOperandsAndResultsScale, - TFL_GpuTargetOp]> { + SameOperandsAndResultsScale]> { let summary = "Transpose operator"; let description = [{ @@ -3187,8 +3134,7 @@ def TFL_SpaceToDepthOp: TFL_Op<"space_to_depth", [ SameOperandsAndResultsScale, PredOpTrait<"input and output must have same element type", TFL_TCresVTEtIsSameAsOp<0, 0>>, - TFL_OperandHasRankAtMost<0, 4>, - TFL_GpuTargetOp + TFL_OperandHasRankAtMost<0, 4> ]> { let summary = "SpaceToDepth operator"; @@ -3400,8 +3346,7 @@ def TFL_StridedSliceOp: TFL_Op<"strided_slice", [ TFL_OperandHasRankAtMost<0, 5>, TFL_OperandHasRank<1, 1>, TFL_OperandHasRank<2, 1>, - TFL_OperandHasRank<3, 1>, - TFL_GpuTargetOp + TFL_OperandHasRank<3, 1> ]> { let summary = "StridedSlice Op"; @@ -3451,7 +3396,7 @@ def TFL_CastOp : TFL_Op<"cast", [ } def TFL_MirrorPadOp: TFL_Op<"mirror_pad", [ - NoSideEffect, TFL_OperandHasRank<1, 2>, TFL_GpuTargetOp]> { + NoSideEffect, TFL_OperandHasRank<1, 2>]> { let summary = "MirrorPad Operator. Pads a tensor with mirrored values."; let description = [{ @@ -4354,7 +4299,8 @@ def TFL_WhileOp : Op { +def TFL_CustomOp : Op { let summary = "Custom op"; let description = [{ @@ -4377,4 +4323,29 @@ def TFL_CustomOp : Op { let verifier = [{ return Verify(*this); }]; } +def TFL_CustomTfOp : Op]> { + let summary = "Wrapper Op for TF custom ops."; + + let description = [{ + A wrapper op around any Custom TF op. These include ops defined using + custom_opdefs or linked ops which are not defined in the TF dialect. + This Op just wraps the custom op inside a region. + Note #1, this Op will not include TF Lite custom ops defined using CustomOp. + Note #2, this op is just an internal representation inside the converter and + is not exposed/exported when the model is exported to Flatbuffer.
+ }]; + + let arguments = (ins + Variadic>:$input + ); + let results = (outs Variadic:$output); + + let regions = (region SizedRegion<1>:$body); +} + #endif // TFL_OPS diff --git a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc index ddd36fbd74c..529c9ee9238 100644 --- a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc @@ -75,7 +75,8 @@ Status HandleInputOutputArraysWithModule(const toco::ModelFlags& model_flags, } auto input_names = input_attr.cast().getValue(); input_names.split(function_input_names, ","); - if (function_input_names.size() != model_flags.input_arrays().size()) { + const int function_input_names_size = function_input_names.size(); + if (function_input_names_size != model_flags.input_arrays().size()) { return errors::InvalidArgument( "input array size mismatch: got ", function_input_names.size(), ", expected: ", model_flags.input_arrays().size()); @@ -99,7 +100,8 @@ Status HandleInputOutputArraysWithModule(const toco::ModelFlags& model_flags, } auto output_names = output_attr.cast().getValue(); output_names.split(function_output_names, ","); - if (function_output_names.size() != model_flags.output_arrays().size()) { + const int function_output_names_size = function_output_names.size(); + if (function_output_names_size != model_flags.output_arrays().size()) { return errors::InvalidArgument( "output array size mismatch: got ", function_output_names.size(), ", expected: ", model_flags.output_arrays().size()); @@ -151,10 +153,13 @@ Status ConvertSavedModelToTFLiteFlatBuffer( return errors::Unimplemented("Only support a single exported name."); } + tensorflow::GraphImportConfig specs; + specs.upgrade_legacy = true; + TF_ASSIGN_OR_RETURN(auto module, ImportSavedModel(model_flags.saved_model_dir(), model_flags.saved_model_version(), tags, - exported_names, &context)); + exported_names, specs, &context)); if (!model_flags.input_arrays().empty() || !model_flags.output_arrays().empty()) { diff --git a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc index 4725eb1ac5f..a4e58123e05 100644 --- a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc +++ b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc @@ -123,6 +123,8 @@ DataType ConvertIODataTypeToDataType(toco::IODataType dtype) { return DT_BOOL; case toco::IODataType::COMPLEX64: return DT_COMPLEX64; + case toco::IODataType::COMPLEX128: + return DT_COMPLEX128; default: return DT_INVALID; } diff --git a/tensorflow/compiler/mlir/lite/quantization/BUILD b/tensorflow/compiler/mlir/lite/quantization/BUILD index de83a37b82e..aec0d8da34f 100644 --- a/tensorflow/compiler/mlir/lite/quantization/BUILD +++ b/tensorflow/compiler/mlir/lite/quantization/BUILD @@ -81,7 +81,6 @@ cc_library( "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", - "@llvm-project//mlir:Analysis", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@llvm-project//mlir:QuantOps", diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc index 0c9ccf1a979..9e0ad990657 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc @@ -794,16 +794,18 @@ 
bool QuantizationDriver::PropagateParams() { } // TODO(fengliuai): make the bit width configurable. - auto spec = GetQuantSpec(op); - auto key = std::make_pair(8, is_signed_); - auto &restricted_outputs = spec->restricted_output_params[key]; - for (int i = 0, e = restricted_outputs.size(); i != e; ++i) { - // The restrict can be nullptr if the result has been quantized. - if (auto params = restricted_outputs[i]) { - changed |= SetResultParams(op, i, params); + if (auto restricted = llvm::dyn_cast(op)) { + // TODO(fengliuai): different result can have different fixed range. + auto params = restricted.GetFixedOutputRange(is_signed_, /*bit_width=*/8); + for (auto i = 0; i < op->getNumResults(); ++i) { + // The range is null if the result has been quantized. + if (params) { + changed |= SetResultParams(op, i, params); + } } } + auto spec = GetQuantSpec(op); for (auto &it : spec->biases_params) { auto params = GetBiasParams(op, it.first, it.second.first, it.second.second); diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc index 8f6b63b3ee6..9991d103449 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc @@ -449,7 +449,7 @@ static bool PreferResultScale(Operation* op) { // only considers the ops with restricted output params. static bool IsStatsRedundant(Operation* op, OpQuantSpecGetter op_quant_spec_getter) { - return !op_quant_spec_getter(op)->restricted_output_params.empty(); + return llvm::isa(op); } bool RemoveRedundantStatsOps(mlir::FuncOp func, @@ -469,7 +469,7 @@ bool RemoveRedundantStatsOps(mlir::FuncOp func, // Step 1: forward pass: propagate any value scales which are not produces // by `SameOperandsAndResultsScale`. Additionally, remove the value scales - // which are produced by the `restricted_output_params`. + // which are produced by the ops with the `FixedOutputRangeInterface`. // Note that we don't propagate across the multiple-operands // `SameOperandsAndResultsScale` ops like `concatenation`. 
func.walk( @@ -594,5 +594,27 @@ LogicalResult VerifySameScales(Operation* op) { } return success(); } + +quant::UniformQuantizedType GetFixedOutputRange(bool is_signed, int bit_width, + Type tensor_type, double scale, + int64_t zero_point, + int64_t storage_min, + int64_t storage_max) { + auto result_type = tensor_type.cast(); + if (!result_type.getElementType().isa()) return {}; + Builder builder(result_type.getContext()); + + // Only support 8-bits + if (bit_width != 8) return {}; + IntegerType storage_type = builder.getIntegerType(bit_width); + if (!is_signed) { + zero_point += 128; + storage_min += 128; + storage_max += 128; + } + return quant::UniformQuantizedType::getChecked( + is_signed, storage_type, result_type.getElementType(), scale, zero_point, + storage_min, storage_max, builder.getUnknownLoc()); +} } // namespace quant } // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h index 4ced43014f5..07e5ba4e879 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h @@ -395,8 +395,6 @@ struct FoldTrivalRequantizeOp : public OpRewritePattern { llvm::SmallVector new_output_types; for (auto result : def->getResults()) { - result.getUsers().begin()->dump(); - op.dump(); if (result.hasOneUse() && *result.getUsers().begin() == op) { new_output_types.push_back(op.qtype()); } else { @@ -502,6 +500,13 @@ void ApplyQuantizationParamsPropagation(mlir::FuncOp func, bool is_signed, bool RemoveRedundantStatsOps(mlir::FuncOp func, OpQuantSpecGetter op_quant_spec_getter); +// Given quantization parameters for int8, compute the quantization parameters +// for uint8 if it is required, and wrap the result in a UniformQuantizedType.
+quant::UniformQuantizedType GetFixedOutputRange(bool is_signed, int bit_width, + Type tensor_type, double scale, + int64_t zero_point, + int64_t storage_min = -128, + int64_t storage_max = 127); } // namespace quant } // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/tests/const-fold.mlir b/tensorflow/compiler/mlir/lite/tests/const-fold.mlir index b37fdb9aa7b..ff7c47fb621 100644 --- a/tensorflow/compiler/mlir/lite/tests/const-fold.mlir +++ b/tensorflow/compiler/mlir/lite/tests/const-fold.mlir @@ -1,4 +1,4 @@ -// RUN: tf-opt %s -canonicalize | FileCheck %s +// RUN: tf-opt %s -canonicalize | FILECHECK_OPTS="" FileCheck %s // CHECK-LABEL: @add_float func @add_float() -> (tensor, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) { diff --git a/tensorflow/compiler/mlir/lite/tests/end2end/custom_opdef.pbtxt b/tensorflow/compiler/mlir/lite/tests/end2end/custom_opdef.pbtxt index 345468e609e..481be9d4deb 100644 --- a/tensorflow/compiler/mlir/lite/tests/end2end/custom_opdef.pbtxt +++ b/tensorflow/compiler/mlir/lite/tests/end2end/custom_opdef.pbtxt @@ -36,11 +36,11 @@ versions { producer: 27 } -# CHECK-LABEL: func @main -# CHECK-SAME: (%[[ARG_0:[a-z0-9]+]]: tensor<4xi32>, %[[ARG_1:[a-z0-9]+]]: tensor<4xi32>) -> tensor<*xi32> -# CHECK-SAME: control_outputs = "" -# CHECK-SAME: inputs = "input0,input1" -# CHECK-SAME: outputs = "output" -# CHECK-NEXT: %[[OP:[a-z0-9]+]] = "tf.BannaPotatoSaladWithColeslaw"(%[[ARG_0]], %[[ARG_1]]) {T = i32, device = ""} : (tensor<4xi32>, tensor<4xi32>) -> tensor<*xi32> -# CHECK-NEXT: return %[[OP]] : tensor<*xi32> -# CHECK-NEXT: } +# CHECK-LABEL: func @main(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<*xi32> +# CHECK: attributes {tf.entry_function = {control_outputs = "", inputs = "input0,input1", outputs = "output"}} { +# CHECK-NEXT: %[[CUSTOM:.*]] = "tfl.custom_tf"(%arg0, %arg1) ( { +# CHECK-NEXT: %[[OUTPUTS:.*]] = "tf.BannaPotatoSaladWithColeslaw"(%arg0, %arg1) {T = i32, device = ""} : (tensor<4xi32>, tensor<4xi32>) -> tensor<*xi32> +# CHECK-NEXT: "tfl.yield"(%[[OUTPUTS]]) : (tensor<*xi32>) -> () +# CHECK-NEXT: }) : (tensor<4xi32>, tensor<4xi32>) -> tensor<*xi32> +# CHECK-NEXT: return %[[CUSTOM]] : tensor<*xi32> +# CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/constants.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/constants.mlir index 50fe804f86c..a622c43c2f2 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/constants.mlir +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/constants.mlir @@ -15,6 +15,13 @@ func @complex64() -> tensor<4xcomplex> { return %0 : tensor<4xcomplex> } +func @complex128() -> tensor<4xcomplex> { + // CHECK-LABEL: @complex128 + // CHECK: value = opaque<"tf", "0x746674656E736F722464747970653A2044545F434F4D504C45583132382074656E736F725F7368617065207B2064696D207B2073697A653A2034207D207D2074656E736F725F636F6E74656E743A20225C3030305C3030305C3030305C3030305C3030305C3030305C3336303F5C3030305C3030305C3030305C3030305C3030305C3030305C303030405C3030305C3030305C3030305C3030305C3030305C3030305C303030405C3030305C3030305C3030305C3030305C3030305C3030305C303030405C3030305C3030305C3030305C3030305C3030305C3030305C303130405C3030305C3030305C3030305C3030305C3030305C3030305C303030405C3030305C3030305C3030305C3030305C3030305C3030305C303230405C3030305C3030305C3030305C3030305C3030305C3030305C3030304022"> : tensor<4xcomplex> + %0 = "tfl.pseudo_const"() { value = opaque<"tf", 
"0x746674656E736F722464747970653A2044545F434F4D504C45583132382074656E736F725F7368617065207B2064696D207B2073697A653A2034207D207D2074656E736F725F636F6E74656E743A20225C3030305C3030305C3030305C3030305C3030305C3030305C3336303F5C3030305C3030305C3030305C3030305C3030305C3030305C303030405C3030305C3030305C3030305C3030305C3030305C3030305C303030405C3030305C3030305C3030305C3030305C3030305C3030305C303030405C3030305C3030305C3030305C3030305C3030305C3030305C303130405C3030305C3030305C3030305C3030305C3030305C3030305C303030405C3030305C3030305C3030305C3030305C3030305C3030305C303230405C3030305C3030305C3030305C3030305C3030305C3030305C3030304022"> : tensor<4xcomplex> } : () -> tensor<4xcomplex> + return %0 : tensor<4xcomplex> +} + // TODO(b/138847107) this should work but doesn't // func @f16() -> tensor<4xf16> { // %0 = "tfl.pseudo_const"() { value = dense<[1.0, 2.0, 3.0, 4.0]> : tensor<4xf16> } : () -> tensor<4xf16> diff --git a/tensorflow/compiler/mlir/lite/tests/fuse-tftext.mlir b/tensorflow/compiler/mlir/lite/tests/fuse-tftext.mlir index 1a3ed0509c4..f6f32e7a069 100644 --- a/tensorflow/compiler/mlir/lite/tests/fuse-tftext.mlir +++ b/tensorflow/compiler/mlir/lite/tests/fuse-tftext.mlir @@ -1,3196 +1,3438 @@ -// RUN: tf-opt -tfl-prepare-composite-funcs-tf -tfl-fuse-tftext=true %s -split-input-file | FileCheck %s -module { +// RUN: tf-opt -tfl-prepare-composite-funcs-tf -tfl-fuse-tftext=true %s | FileCheck %s - func @whitespace_tokenizer_rank1(%arg0: tensor<1x!tf.string> {tf._user_specified_name = "input"}) -> (tensor, tensor) attributes {tf._input_shapes = [#tf.shape<1>], tf.api_implements = "tftext:WhitespaceTokenizer", tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<[0, 1]> : tensor<2xi64>} : () -> tensor<2xi64> - %1 = "tf.Const"() {value = dense<[]> : tensor<0xi64>} : () -> tensor<0xi64> - %2 = "tf.Const"() {value = dense : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor - %4 = "tf.Const"() {value = dense<[[0], [1]]> : tensor<2x1xi64>} : () -> tensor<2x1xi64> - %5 = "tf.Const"() {value = dense<-1> : tensor<1xi32>} : () -> tensor<1xi32> - %6 = "tf.Const"() {value = dense<2> : tensor<1xi32>} : () -> tensor<1xi32> - %7 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - %8 = "tf.Const"() {value = dense<2> : tensor} : () -> tensor - %9 = "tf.Const"() {value = dense<[]> : tensor<0xi32>} : () -> tensor<0xi32> - %10 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - %11 = "tf.Const"() {value = dense<0> : tensor<1xi64>} : () -> tensor<1xi64> - %12 = "tf.Const"() {value = dense<1> : tensor<1xi64>} : () -> tensor<1xi64> - %13 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - %14 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - %15 = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> - %16 = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> - %17 = "tf.If"(%2, %2, %13, %13) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedConcat_assert_equal_1_Assert_AssertGuard_false_3210, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedConcat_assert_equal_1_Assert_AssertGuard_true_3200} : (tensor, tensor, tensor, tensor) -> tensor - %18 = "tf.Identity"(%17) {device = ""} : (tensor) -> tensor - %19 = "tf.StringLength"(%arg0) {device = "", unit = "BYTE"} : (tensor<1x!tf.string>) -> tensor<1xi32> - %20 = "tf.ExpandDims"(%19, %7) {device = ""} : (tensor<1xi32>, tensor) -> tensor<1x1xi32> - %21 
= "tf.Cast"(%20) {Truncate = false, device = ""} : (tensor<1x1xi32>) -> tensor<1x1xi64> - %22 = "tf.Reshape"(%21, %12) {device = ""} : (tensor<1x1xi64>, tensor<1xi64>) -> tensor<1xi64> - %23 = "tf.Reshape"(%arg0, %5) {device = ""} : (tensor<1x!tf.string>, tensor<1xi32>) -> tensor<1x!tf.string> - %24:3 = "tf.UnicodeDecodeWithOffsets"(%23) {Tsplits = i64, device = "", errors = "replace", input_encoding = "UTF-8", replace_control_characters = false, replacement_char = 65533 : i64} : (tensor<1x!tf.string>) -> (tensor<2xi64>, tensor, tensor) - %25 = "tf.StridedSlice"(%24#0, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> - %26 = "tf.AddV2"(%25, %13) {device = ""} : (tensor<1xi64>, tensor) -> tensor<1xi64> - %27 = "tf.StridedSlice"(%24#0, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> - %28 = "tf.Minimum"(%26, %27) {device = ""} : (tensor<1xi64>, tensor<1xi64>) -> tensor<1xi64> - %29:2 = "tf.RaggedRange"(%28, %27, %13) {T = i64, Tsplits = i64, device = ""} : (tensor<1xi64>, tensor<1xi64>, tensor) -> (tensor<2xi64>, tensor) - %30 = "tf.StridedSlice"(%29#0, %5, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %31 = "tf.AddV2"(%30, %12) {device = ""} : (tensor, tensor<1xi64>) -> tensor<1xi64> - %32 = "tf.ConcatV2"(%29#0, %31, %14) {device = ""} : (tensor<2xi64>, tensor<1xi64>, tensor) -> tensor<3xi64> - %33 = "tf.GatherV2"(%24#2, %29#1, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %34 = "tf.ConcatV2"(%33, %22, %14) {device = ""} : (tensor, tensor<1xi64>, tensor) -> tensor - %35:2 = "tf.RaggedGather"(%32, %34, %0) {OUTPUT_RAGGED_RANK = 1 : i64, PARAMS_RAGGED_RANK = 1 : i64, Tindices = i64, Tsplits = i64, Tvalues = i64, device = ""} : (tensor<3xi64>, tensor, tensor<2xi64>) -> (tensor, tensor) - %36:5 = "tf.WhitespaceTokenizeWithOffsets"(%24#1, %24#0) {Tsplits = i64, device = ""} : (tensor, tensor<2xi64>) -> (tensor, tensor, tensor, tensor, tensor) - %37 = "tf.StridedSlice"(%36#1, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %38 = "tf.Equal"(%37, %10) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %39 = "tf.All"(%38, %9) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor - %40 = "tf.If"(%39, %39, %37, %10) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_3970, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_3960} : (tensor, tensor, tensor, tensor) -> tensor - %41 = "tf.Identity"(%40) {device = ""} : (tensor) -> tensor - %42 = "tf.StridedSlice"(%36#1, %16, %15, %16) {begin_mask = 0 : i64, device = "", 
ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %43 = "tf.StridedSlice"(%36#1, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %44 = "tf.Sub"(%42, %43) {device = ""} : (tensor, tensor) -> tensor - %45 = "tf.LessEqual"(%10, %44) {device = ""} : (tensor, tensor) -> tensor - %46 = "tf.All"(%45, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor - %47 = "tf.If"(%46, %46, %44) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_4330, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_4320} : (tensor, tensor, tensor) -> tensor - %48 = "tf.Identity"(%47) {device = ""} : (tensor) -> tensor - %49 = "tf.Identity"(%36#1) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor - %50 = "tf.StridedSlice"(%49, %5, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %51 = "tf.Shape"(%36#0) {device = ""} : (tensor) -> tensor<1xi64> - %52 = "tf.StridedSlice"(%51, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %53 = "tf.Equal"(%50, %52) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %54 = "tf.All"(%53, %9) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor - %55 = "tf.If"(%54, %54, %50, %52) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_4670, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_4660} : (tensor, tensor, tensor, tensor) -> tensor - %56 = "tf.Identity"(%55) {device = ""} : (tensor) -> tensor - %57 = "tf.Identity"(%49) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor - %58 = "tf.Shape"(%57) {device = ""} : (tensor) -> tensor<1xi64> - %59 = "tf.StridedSlice"(%58, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %60 = "tf.Sub"(%59, %13) {device = ""} : (tensor, tensor) -> tensor - %61 = "tf.StridedSlice"(%36#4, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %62 = "tf.Equal"(%61, %10) {device = "", incompatible_shape_error = true} : (tensor, tensor) 
-> tensor - %63 = "tf.All"(%62, %9) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor - %64 = "tf.If"(%63, %63, %61, %10) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_5040, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_5030} : (tensor, tensor, tensor, tensor) -> tensor - %65 = "tf.Identity"(%64) {device = ""} : (tensor) -> tensor - %66 = "tf.StridedSlice"(%36#4, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %67 = "tf.StridedSlice"(%36#4, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %68 = "tf.Sub"(%66, %67) {device = ""} : (tensor, tensor) -> tensor - %69 = "tf.LessEqual"(%10, %68) {device = ""} : (tensor, tensor) -> tensor - %70 = "tf.All"(%69, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor - %71 = "tf.If"(%70, %70, %68) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_5400, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_5390} : (tensor, tensor, tensor) -> tensor - %72 = "tf.Identity"(%71) {device = ""} : (tensor) -> tensor - %73 = "tf.Identity"(%36#4) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor - %74 = "tf.StridedSlice"(%73, %5, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %75 = "tf.Equal"(%74, %60) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %76 = "tf.All"(%75, %9) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor - %77 = "tf.If"(%76, %76, %74, %60) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_assert_equal_1_Assert_AssertGuard_false_5760, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_assert_equal_1_Assert_AssertGuard_true_5750} : (tensor, tensor, tensor, tensor) -> tensor - %78 = "tf.Identity"(%77) {device = ""} : (tensor) -> tensor - %79 = "tf.Identity"(%73) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor - %80 = "tf.StridedSlice"(%36#4, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %81 = "tf.Equal"(%80, 
%10) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %82 = "tf.All"(%81, %9) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor - %83 = "tf.If"(%82, %82, %80, %10) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_6110, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_6100} : (tensor, tensor, tensor, tensor) -> tensor - %84 = "tf.Identity"(%83) {device = ""} : (tensor) -> tensor - %85 = "tf.StridedSlice"(%36#4, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %86 = "tf.StridedSlice"(%36#4, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %87 = "tf.Sub"(%85, %86) {device = ""} : (tensor, tensor) -> tensor - %88 = "tf.LessEqual"(%10, %87) {device = ""} : (tensor, tensor) -> tensor - %89 = "tf.All"(%88, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor - %90 = "tf.If"(%89, %89, %87) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_6470, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_6460} : (tensor, tensor, tensor) -> tensor - %91 = "tf.Identity"(%90) {device = ""} : (tensor) -> tensor - %92 = "tf.Identity"(%36#4) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor - %93 = "tf.StridedSlice"(%92, %5, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %94 = "tf.Shape"(%36#2) {device = ""} : (tensor) -> tensor<1xi64> - %95 = "tf.StridedSlice"(%94, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %96 = "tf.Equal"(%93, %95) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %97 = "tf.All"(%96, %9) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor - %98 = "tf.If"(%97, %97, %93, %95) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_6810, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_6800} : (tensor, tensor, tensor, tensor) -> tensor - %99 = "tf.Identity"(%98) {device = ""} : (tensor) -> tensor - %100 = 
"tf.Identity"(%92) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor - %101 = "tf.Shape"(%100) {device = ""} : (tensor) -> tensor<1xi64> - %102 = "tf.StridedSlice"(%101, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %103 = "tf.Sub"(%102, %13) {device = ""} : (tensor, tensor) -> tensor - %104 = "tf.Equal"(%103, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %105 = "tf.LogicalOr"(%104, %2) {device = ""} : (tensor, tensor) -> tensor - %106 = "tf.Equal"(%103, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %107 = "tf.LogicalOr"(%105, %106) {device = ""} : (tensor, tensor) -> tensor - %108 = "tf.StridedSlice"(%100, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %109 = "tf.StridedSlice"(%100, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %110 = "tf.Sub"(%108, %109) {device = ""} : (tensor, tensor) -> tensor - %111 = "tf.Shape"(%100) {device = ""} : (tensor) -> tensor<1xi64> - %112 = "tf.StridedSlice"(%111, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %113 = "tf.Sub"(%112, %13) {device = ""} : (tensor, tensor) -> tensor - %114 = "tf.Equal"(%113, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %115 = "tf.ExpandDims"(%100, %7) {device = ""} : (tensor, tensor) -> tensor - %116 = "tf.Shape"(%100) {device = ""} : (tensor) -> tensor<1xi32> - %117 = "tf.StridedSlice"(%116, %15, %15, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %118 = "tf.StridedSlice"(%116, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %119 = "tf.StridedSlice"(%116, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %120 = "tf.StridedSlice"(%36#4, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %121 = "tf.Equal"(%120, %10) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %122 = "tf.All"(%121, %9) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor - %123 = "tf.If"(%122, %122, %120, %10) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = 
@WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_7180, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_7170} : (tensor, tensor, tensor, tensor) -> tensor - %124 = "tf.Identity"(%123) {device = ""} : (tensor) -> tensor - %125 = "tf.StridedSlice"(%36#4, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %126 = "tf.StridedSlice"(%36#4, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %127 = "tf.Sub"(%125, %126) {device = ""} : (tensor, tensor) -> tensor - %128 = "tf.LessEqual"(%10, %127) {device = ""} : (tensor, tensor) -> tensor - %129 = "tf.All"(%128, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor - %130 = "tf.If"(%129, %129, %127) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_7540, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_7530} : (tensor, tensor, tensor) -> tensor - %131 = "tf.Identity"(%130) {device = ""} : (tensor) -> tensor - %132 = "tf.Identity"(%36#4) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor - %133 = "tf.StridedSlice"(%132, %5, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %134 = "tf.Shape"(%36#3) {device = ""} : (tensor) -> tensor<1xi64> - %135 = "tf.StridedSlice"(%134, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %136 = "tf.Equal"(%133, %135) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %137 = "tf.All"(%136, %9) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor - %138 = "tf.If"(%137, %137, %133, %135) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_7880, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_7870} : (tensor, tensor, tensor, tensor) -> tensor - %139 = "tf.Identity"(%138) {device = ""} : (tensor) -> tensor - %140 = "tf.Identity"(%132) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor - %141 = "tf.Shape"(%140) {device = ""} : (tensor) -> tensor<1xi64> - %142 = "tf.StridedSlice"(%141, %15, %16, %16) {begin_mask = 0 : i64, device = 
"", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %143 = "tf.Sub"(%142, %13) {device = ""} : (tensor, tensor) -> tensor - %144 = "tf.Equal"(%143, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %145 = "tf.LogicalOr"(%144, %2) {device = ""} : (tensor, tensor) -> tensor - %146 = "tf.Equal"(%143, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %147 = "tf.LogicalOr"(%145, %146) {device = ""} : (tensor, tensor) -> tensor - %148 = "tf.StridedSlice"(%140, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %149 = "tf.StridedSlice"(%140, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %150 = "tf.Sub"(%148, %149) {device = ""} : (tensor, tensor) -> tensor - %151 = "tf.Shape"(%140) {device = ""} : (tensor) -> tensor<1xi64> - %152 = "tf.StridedSlice"(%151, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %153 = "tf.Sub"(%152, %13) {device = ""} : (tensor, tensor) -> tensor - %154 = "tf.Equal"(%153, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %155 = "tf.ExpandDims"(%140, %7) {device = ""} : (tensor, tensor) -> tensor - %156 = "tf.Shape"(%140) {device = ""} : (tensor) -> tensor<1xi32> - %157 = "tf.StridedSlice"(%156, %15, %15, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %158 = "tf.StridedSlice"(%156, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %159 = "tf.StridedSlice"(%156, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %160 = "tf.StridedSlice"(%140, %5, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %161 = "tf.Range"(%10, %160, %13) {device = ""} : (tensor, tensor, tensor) -> tensor - %162 = "tf.StridedSlice"(%140, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %163 = "tf.StridedSlice"(%140, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %164 = "tf.Sub"(%162, %163) {device = ""} : (tensor, tensor) -> tensor - %165 = "tf.If"(%107, %107, %13, %103) 
{_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedGather_Assert_AssertGuard_false_8680, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedGather_Assert_AssertGuard_true_8670} : (tensor, tensor, tensor, tensor) -> tensor - %166 = "tf.Identity"(%165) {device = ""} : (tensor) -> tensor - %167 = "tf.Equal"(%103, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %168 = "tf.Select"(%167, %13, %103) {device = ""} : (tensor, tensor, tensor) -> tensor - %169 = "tf.Equal"(%168, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %170 = "tf.LogicalOr"(%169, %2) {device = ""} : (tensor, tensor) -> tensor - %171 = "tf.Equal"(%168, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %172 = "tf.LogicalOr"(%170, %171) {device = ""} : (tensor, tensor) -> tensor - %173 = "tf.Select"(%114, %168, %13) {device = ""} : (tensor, tensor, tensor) -> tensor - %174 = "tf.Pack"(%173, %13) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> - %175 = "tf.StridedSlice"(%174, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %176 = "tf.Cast"(%175) {Truncate = false, device = ""} : (tensor) -> tensor - %177 = "tf.Reshape"(%176, %9) {device = ""} : (tensor, tensor<0xi32>) -> tensor - %178 = "tf.Pack"(%7, %177) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %179 = "tf.Tile"(%115, %178) {device = ""} : (tensor, tensor<2xi32>) -> tensor - %180 = "tf.Mul"(%177, %118) {device = ""} : (tensor, tensor) -> tensor - %181 = "tf.Pack"(%180) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %182 = "tf.ConcatV2"(%117, %181, %119, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %183 = "tf.Reshape"(%179, %182) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %184 = "tf.Shape"(%183) {device = ""} : (tensor) -> tensor<1xi64> - %185 = "tf.StridedSlice"(%184, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %186 = "tf.Pack"(%175) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %187 = "tf.StridedSlice"(%183, %186, %11, %12) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %188 = "tf.Sub"(%185, %175) {device = ""} : (tensor, tensor) -> tensor - %189 = "tf.Pack"(%188) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %190 = "tf.StridedSlice"(%183, %11, %189, %12) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %191:2 = "tf.RaggedRange"(%190, %187, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) - %192 = "tf.Select"(%2, %168, %13) {device = ""} : (tensor, tensor, tensor) -> tensor - %193 = "tf.Pack"(%192, %13) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> - %194 = "tf.StridedSlice"(%193, %16, %6, %16) {begin_mask = 0 : i64, 
device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %195 = "tf.Cast"(%194) {Truncate = false, device = ""} : (tensor) -> tensor - %196 = "tf.Reshape"(%195, %9) {device = ""} : (tensor, tensor<0xi32>) -> tensor - %197 = "tf.Pack"(%7, %196) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %198 = "tf.Tile"(%4, %197) {device = ""} : (tensor<2x1xi64>, tensor<2xi32>) -> tensor<2x?xi64> - %199 = "tf.Mul"(%196, %8) {device = ""} : (tensor, tensor) -> tensor - %200 = "tf.Pack"(%199) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %201 = "tf.ConcatV2"(%9, %200, %9, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %202 = "tf.Reshape"(%198, %201) {device = ""} : (tensor<2x?xi64>, tensor<1xi32>) -> tensor - %203 = "tf.Shape"(%202) {device = ""} : (tensor) -> tensor<1xi64> - %204 = "tf.StridedSlice"(%203, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %205 = "tf.Pack"(%194) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %206 = "tf.StridedSlice"(%202, %205, %11, %12) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %207 = "tf.Sub"(%204, %194) {device = ""} : (tensor, tensor) -> tensor - %208 = "tf.Pack"(%207) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %209 = "tf.StridedSlice"(%202, %11, %208, %12) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %210:2 = "tf.RaggedRange"(%209, %206, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) - %211 = "tf.StridedSlice"(%193, %15, %16, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> - %212 = "tf.StridedSlice"(%193, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %213 = "tf.Mul"(%212, %12) {device = ""} : (tensor, tensor<1xi64>) -> tensor<1xi64> - %214 = "tf.Tile"(%213, %211) {device = ""} : (tensor<1xi64>, tensor<1xi64>) -> tensor - %215 = "tf.Cumsum"(%214, %14) {device = "", exclusive = false, reverse = false} : (tensor, tensor) -> tensor - %216 = "tf.ConcatV2"(%11, %215, %3) {device = ""} : (tensor<1xi64>, tensor, tensor) -> tensor - %217 = "tf.StridedSlice"(%216, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %218 = "tf.ExpandDims"(%217, %7) {device = ""} : (tensor, tensor) -> tensor - %219 = "tf.Shape"(%217) {device = ""} : (tensor) -> tensor<1xi32> - %220 = "tf.StridedSlice"(%219, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, 
shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %221 = "tf.Pack"(%220) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %222 = "tf.StridedSlice"(%216, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %223 = "tf.ExpandDims"(%222, %7) {device = ""} : (tensor, tensor) -> tensor - %224 = "tf.Shape"(%222) {device = ""} : (tensor) -> tensor<1xi32> - %225 = "tf.StridedSlice"(%224, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %226 = "tf.Pack"(%225) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %227 = "tf.Equal"(%103, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %228 = "tf.Select"(%227, %168, %13) {device = ""} : (tensor, tensor, tensor) -> tensor - %229 = "tf.Cast"(%228) {Truncate = false, device = ""} : (tensor) -> tensor - %230 = "tf.Reshape"(%229, %9) {device = ""} : (tensor, tensor<0xi32>) -> tensor - %231 = "tf.Pack"(%7, %230) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %232 = "tf.Mul"(%230, %8) {device = ""} : (tensor, tensor) -> tensor - %233 = "tf.Pack"(%232) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %234 = "tf.ConcatV2"(%9, %233, %9, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %235 = "tf.Pack"(%228) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %236 = "tf.Pack"(%10, %103) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> - %237 = "tf.ExpandDims"(%236, %7) {device = ""} : (tensor<2xi64>, tensor) -> tensor<2x1xi64> - %238 = "tf.Tile"(%237, %231) {device = ""} : (tensor<2x1xi64>, tensor<2xi32>) -> tensor<2x?xi64> - %239 = "tf.Reshape"(%238, %234) {device = ""} : (tensor<2x?xi64>, tensor<1xi32>) -> tensor - %240 = "tf.Shape"(%239) {device = ""} : (tensor) -> tensor<1xi64> - %241 = "tf.StridedSlice"(%240, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %242 = "tf.Sub"(%241, %228) {device = ""} : (tensor, tensor) -> tensor - %243 = "tf.Pack"(%242) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %244 = "tf.StridedSlice"(%239, %11, %243, %12) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %245 = "tf.StridedSlice"(%239, %235, %11, %12) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %246:2 = "tf.RaggedRange"(%244, %245, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) - %247 = "tf.GatherV2"(%110, %246#1, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %248 = "tf.Cast"(%247) {Truncate = false, device = ""} : (tensor) -> tensor - %249 = "tf.BroadcastTo"(%248, %221) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %250 = "tf.Max"(%249, %15) {device = "", 
keep_dims = false} : (tensor, tensor<1xi32>) -> tensor - %251 = "tf.Maximum"(%14, %250) {device = ""} : (tensor, tensor) -> tensor - %252 = "tf.Range"(%14, %251, %7) {device = ""} : (tensor, tensor, tensor) -> tensor - %253 = "tf.Pack"(%7, %251) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %254 = "tf.Tile"(%218, %253) {device = ""} : (tensor, tensor<2xi32>) -> tensor - %255 = "tf.Shape"(%254) {device = ""} : (tensor) -> tensor<2xi32> - %256 = "tf.StridedSlice"(%255, %15, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> - %257 = "tf.Prod"(%256, %15) {device = "", keep_dims = false} : (tensor<2xi32>, tensor<1xi32>) -> tensor - %258 = "tf.Pack"(%257) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %259 = "tf.Shape"(%254) {device = ""} : (tensor) -> tensor<2xi32> - %260 = "tf.StridedSlice"(%259, %15, %15, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %261 = "tf.Shape"(%254) {device = ""} : (tensor) -> tensor<2xi32> - %262 = "tf.StridedSlice"(%261, %6, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %263 = "tf.ConcatV2"(%260, %258, %262, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %264 = "tf.Reshape"(%254, %263) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %265 = "tf.ExpandDims"(%249, %3) {device = ""} : (tensor, tensor) -> tensor - %266 = "tf.Less"(%252, %265) {device = ""} : (tensor, tensor) -> tensor - %267 = "tf.Reshape"(%266, %5) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %268 = "tf.Where"(%267) {device = ""} : (tensor) -> tensor - %269 = "tf.Squeeze"(%268) {device = "", squeeze_dims = [1]} : (tensor) -> tensor - %270 = "tf.GatherV2"(%264, %269, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %271 = "tf.Cast"(%247) {Truncate = false, device = ""} : (tensor) -> tensor - %272 = "tf.BroadcastTo"(%271, %226) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %273 = "tf.Max"(%272, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor - %274 = "tf.Maximum"(%14, %273) {device = ""} : (tensor, tensor) -> tensor - %275 = "tf.Range"(%14, %274, %7) {device = ""} : (tensor, tensor, tensor) -> tensor - %276 = "tf.Pack"(%7, %274) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %277 = "tf.Tile"(%223, %276) {device = ""} : (tensor, tensor<2xi32>) -> tensor - %278 = "tf.Shape"(%277) {device = ""} : (tensor) -> tensor<2xi32> - %279 = "tf.StridedSlice"(%278, %15, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> - %280 = "tf.Prod"(%279, %15) {device = "", keep_dims = false} : (tensor<2xi32>, tensor<1xi32>) -> tensor - %281 = "tf.Pack"(%280) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %282 = "tf.Shape"(%277) {device = ""} : (tensor) -> tensor<2xi32> - %283 = "tf.StridedSlice"(%282, %15, %15, %16) {begin_mask = 1 : i64, device = "", 
ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %284 = "tf.Shape"(%277) {device = ""} : (tensor) -> tensor<2xi32> - %285 = "tf.StridedSlice"(%284, %6, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %286 = "tf.ConcatV2"(%283, %281, %285, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %287 = "tf.Reshape"(%277, %286) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %288 = "tf.ExpandDims"(%272, %3) {device = ""} : (tensor, tensor) -> tensor - %289 = "tf.Less"(%275, %288) {device = ""} : (tensor, tensor) -> tensor - %290 = "tf.Reshape"(%289, %5) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %291 = "tf.Where"(%290) {device = ""} : (tensor) -> tensor - %292 = "tf.Squeeze"(%291) {device = "", squeeze_dims = [1]} : (tensor) -> tensor - %293 = "tf.GatherV2"(%287, %292, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %294:2 = "tf.RaggedRange"(%270, %293, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) - %295 = "tf.If"(%172, %172, %168, %13) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedGather_Assert_1_AssertGuard_false_9750, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedGather_Assert_1_AssertGuard_true_9740} : (tensor, tensor, tensor, tensor) -> tensor - %296 = "tf.Identity"(%295) {device = ""} : (tensor) -> tensor - %297 = "tf.Select"(%2, %168, %13) {device = ""} : (tensor, tensor, tensor) -> tensor - %298 = "tf.Pack"(%297) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %299 = "tf.ConcatV2"(%1, %298, %12, %14) {device = ""} : (tensor<0xi64>, tensor<1xi64>, tensor<1xi64>, tensor) -> tensor<2xi64> - %300 = "tf.StridedSlice"(%299, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %301 = "tf.Equal"(%300, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %302 = "tf.StridedSlice"(%299, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %303 = "tf.StridedSlice"(%299, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %304 = "tf.Equal"(%303, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %305 = "tf.If"(%304, %304, %303, %247) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedGather_Assert_2_AssertGuard_false_10240, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedGather_Assert_2_AssertGuard_true_10230} : (tensor, tensor, tensor, tensor) -> tensor - %306 = "tf.Identity"(%305) {device = ""} : (tensor) -> tensor - %307 = "tf.If"(%301, %301, %247, %302) 
{_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedGather_Assert_3_AssertGuard_false_10600, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedGather_Assert_3_AssertGuard_true_10590} : (tensor, tensor, tensor, tensor) -> tensor - %308 = "tf.If"(%147, %147, %13, %143) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_Assert_AssertGuard_false_15300, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_Assert_AssertGuard_true_15290} : (tensor, tensor, tensor, tensor) -> tensor - %309 = "tf.Identity"(%308) {device = ""} : (tensor) -> tensor - %310 = "tf.Equal"(%143, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %311 = "tf.Select"(%310, %13, %143) {device = ""} : (tensor, tensor, tensor) -> tensor - %312 = "tf.Equal"(%311, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %313 = "tf.LogicalOr"(%312, %2) {device = ""} : (tensor, tensor) -> tensor - %314 = "tf.Equal"(%311, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %315 = "tf.LogicalOr"(%313, %314) {device = ""} : (tensor, tensor) -> tensor - %316 = "tf.Select"(%154, %311, %13) {device = ""} : (tensor, tensor, tensor) -> tensor - %317 = "tf.Pack"(%316, %13) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> - %318 = "tf.StridedSlice"(%317, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %319 = "tf.Cast"(%318) {Truncate = false, device = ""} : (tensor) -> tensor - %320 = "tf.Reshape"(%319, %9) {device = ""} : (tensor, tensor<0xi32>) -> tensor - %321 = "tf.Pack"(%7, %320) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %322 = "tf.Tile"(%155, %321) {device = ""} : (tensor, tensor<2xi32>) -> tensor - %323 = "tf.Mul"(%320, %158) {device = ""} : (tensor, tensor) -> tensor - %324 = "tf.Pack"(%323) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %325 = "tf.ConcatV2"(%157, %324, %159, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %326 = "tf.Reshape"(%322, %325) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %327 = "tf.Shape"(%326) {device = ""} : (tensor) -> tensor<1xi64> - %328 = "tf.StridedSlice"(%327, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %329 = "tf.Pack"(%318) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %330 = "tf.StridedSlice"(%326, %329, %11, %12) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %331 = "tf.Sub"(%328, %318) {device = ""} : (tensor, tensor) -> tensor - %332 = "tf.Pack"(%331) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %333 = "tf.StridedSlice"(%326, %11, %332, %12) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %334:2 = 
"tf.RaggedRange"(%333, %330, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) - %335 = "tf.GatherV2"(%161, %334#1, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %336 = "tf.StridedSlice"(%317, %15, %16, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> - %337 = "tf.StridedSlice"(%317, %15, %16, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> - %338 = "tf.StridedSlice"(%317, %6, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi64> - %339 = "tf.ConcatV2"(%337, %338, %14) {device = ""} : (tensor<1xi64>, tensor<0xi64>, tensor) -> tensor<1xi64> - %340 = "tf.StridedSlice"(%317, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %341 = "tf.Mul"(%164, %340) {device = ""} : (tensor, tensor) -> tensor - %342 = "tf.Tile"(%341, %336) {device = ""} : (tensor, tensor<1xi64>) -> tensor - %343 = "tf.Cumsum"(%342, %14) {device = "", exclusive = false, reverse = false} : (tensor, tensor) -> tensor - %344 = "tf.ConcatV2"(%11, %343, %3) {device = ""} : (tensor<1xi64>, tensor, tensor) -> tensor - %345 = "tf.Shape"(%344) {device = ""} : (tensor) -> tensor<1xi64> - %346 = "tf.StridedSlice"(%345, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %347 = "tf.Sub"(%346, %13) {device = ""} : (tensor, tensor) -> tensor - %348 = "tf.Equal"(%347, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %349 = "tf.LogicalOr"(%348, %2) {device = ""} : (tensor, tensor) -> tensor - %350 = "tf.Equal"(%347, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %351 = "tf.LogicalOr"(%349, %350) {device = ""} : (tensor, tensor) -> tensor - %352 = "tf.StridedSlice"(%344, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %353 = "tf.StridedSlice"(%344, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %354 = "tf.Sub"(%352, %353) {device = ""} : (tensor, tensor) -> tensor - %355 = "tf.Shape"(%344) {device = ""} : (tensor) -> tensor<1xi64> - %356 = "tf.StridedSlice"(%355, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %357 = "tf.Sub"(%356, %13) {device = ""} : (tensor, tensor) -> tensor - %358 = "tf.Equal"(%357, %13) {device = "", incompatible_shape_error = true} : 
(tensor, tensor) -> tensor - %359 = "tf.ExpandDims"(%344, %7) {device = ""} : (tensor, tensor) -> tensor - %360 = "tf.Shape"(%344) {device = ""} : (tensor) -> tensor<1xi32> - %361 = "tf.StridedSlice"(%360, %15, %15, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %362 = "tf.StridedSlice"(%360, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %363 = "tf.StridedSlice"(%360, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %364 = "tf.Select"(%2, %311, %13) {device = ""} : (tensor, tensor, tensor) -> tensor - %365 = "tf.Pack"(%364, %13) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> - %366 = "tf.StridedSlice"(%365, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %367 = "tf.Cast"(%366) {Truncate = false, device = ""} : (tensor) -> tensor - %368 = "tf.Reshape"(%367, %9) {device = ""} : (tensor, tensor<0xi32>) -> tensor - %369 = "tf.Pack"(%7, %368) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %370 = "tf.Tile"(%4, %369) {device = ""} : (tensor<2x1xi64>, tensor<2xi32>) -> tensor<2x?xi64> - %371 = "tf.Mul"(%368, %8) {device = ""} : (tensor, tensor) -> tensor - %372 = "tf.Pack"(%371) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %373 = "tf.ConcatV2"(%9, %372, %9, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %374 = "tf.Reshape"(%370, %373) {device = ""} : (tensor<2x?xi64>, tensor<1xi32>) -> tensor - %375 = "tf.Shape"(%374) {device = ""} : (tensor) -> tensor<1xi64> - %376 = "tf.StridedSlice"(%375, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %377 = "tf.Pack"(%366) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %378 = "tf.StridedSlice"(%374, %377, %11, %12) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %379 = "tf.Sub"(%376, %366) {device = ""} : (tensor, tensor) -> tensor - %380 = "tf.Pack"(%379) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %381 = "tf.StridedSlice"(%374, %11, %380, %12) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %382:2 = "tf.RaggedRange"(%381, %378, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) - %383 = "tf.GatherV2"(%11, %382#1, %14) {batch_dims = 0 : i64, device = ""} : (tensor<1xi64>, tensor, tensor) -> tensor - %384 = "tf.GatherV2"(%12, %383, %14) {batch_dims = 0 : i64, device = ""} : (tensor<1xi64>, tensor, tensor) -> tensor - %385 
= "tf.StridedSlice"(%365, %15, %16, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> - %386 = "tf.StridedSlice"(%365, %15, %16, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> - %387 = "tf.StridedSlice"(%365, %6, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi64> - %388 = "tf.ConcatV2"(%386, %387, %14) {device = ""} : (tensor<1xi64>, tensor<0xi64>, tensor) -> tensor<1xi64> - %389 = "tf.Tile"(%384, %388) {device = ""} : (tensor, tensor<1xi64>) -> tensor - %390 = "tf.StridedSlice"(%365, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %391 = "tf.Mul"(%390, %12) {device = ""} : (tensor, tensor<1xi64>) -> tensor<1xi64> - %392 = "tf.Tile"(%391, %385) {device = ""} : (tensor<1xi64>, tensor<1xi64>) -> tensor - %393 = "tf.Cumsum"(%392, %14) {device = "", exclusive = false, reverse = false} : (tensor, tensor) -> tensor - %394 = "tf.ConcatV2"(%11, %393, %3) {device = ""} : (tensor<1xi64>, tensor, tensor) -> tensor - %395 = "tf.StridedSlice"(%394, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %396 = "tf.ExpandDims"(%395, %7) {device = ""} : (tensor, tensor) -> tensor - %397 = "tf.Shape"(%395) {device = ""} : (tensor) -> tensor<1xi32> - %398 = "tf.StridedSlice"(%397, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %399 = "tf.Pack"(%398) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %400 = "tf.StridedSlice"(%394, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %401 = "tf.ExpandDims"(%400, %7) {device = ""} : (tensor, tensor) -> tensor - %402 = "tf.Shape"(%400) {device = ""} : (tensor) -> tensor<1xi32> - %403 = "tf.StridedSlice"(%402, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %404 = "tf.Pack"(%403) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %405 = "tf.Equal"(%143, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %406 = "tf.Select"(%405, %311, %13) {device = ""} : (tensor, tensor, tensor) -> tensor - %407 = "tf.Cast"(%406) {Truncate = false, device = ""} : (tensor) -> tensor - %408 = "tf.Reshape"(%407, %9) {device = ""} : (tensor, tensor<0xi32>) -> tensor - %409 = "tf.Pack"(%7, %408) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %410 = "tf.Mul"(%408, %8) 
{device = ""} : (tensor, tensor) -> tensor - %411 = "tf.Pack"(%410) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %412 = "tf.ConcatV2"(%9, %411, %9, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %413 = "tf.Pack"(%406) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %414 = "tf.Pack"(%10, %143) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> - %415 = "tf.ExpandDims"(%414, %7) {device = ""} : (tensor<2xi64>, tensor) -> tensor<2x1xi64> - %416 = "tf.Tile"(%415, %409) {device = ""} : (tensor<2x1xi64>, tensor<2xi32>) -> tensor<2x?xi64> - %417 = "tf.Reshape"(%416, %412) {device = ""} : (tensor<2x?xi64>, tensor<1xi32>) -> tensor - %418 = "tf.Shape"(%417) {device = ""} : (tensor) -> tensor<1xi64> - %419 = "tf.StridedSlice"(%418, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %420 = "tf.Sub"(%419, %406) {device = ""} : (tensor, tensor) -> tensor - %421 = "tf.Pack"(%420) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %422 = "tf.StridedSlice"(%417, %11, %421, %12) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %423 = "tf.StridedSlice"(%417, %413, %11, %12) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %424:2 = "tf.RaggedRange"(%422, %423, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) - %425 = "tf.GatherV2"(%150, %424#1, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %426 = "tf.Cast"(%425) {Truncate = false, device = ""} : (tensor) -> tensor - %427 = "tf.BroadcastTo"(%426, %399) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %428 = "tf.Max"(%427, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor - %429 = "tf.Maximum"(%14, %428) {device = ""} : (tensor, tensor) -> tensor - %430 = "tf.Range"(%14, %429, %7) {device = ""} : (tensor, tensor, tensor) -> tensor - %431 = "tf.Pack"(%7, %429) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %432 = "tf.Tile"(%396, %431) {device = ""} : (tensor, tensor<2xi32>) -> tensor - %433 = "tf.Shape"(%432) {device = ""} : (tensor) -> tensor<2xi32> - %434 = "tf.StridedSlice"(%433, %15, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> - %435 = "tf.Prod"(%434, %15) {device = "", keep_dims = false} : (tensor<2xi32>, tensor<1xi32>) -> tensor - %436 = "tf.Pack"(%435) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %437 = "tf.Shape"(%432) {device = ""} : (tensor) -> tensor<2xi32> - %438 = "tf.StridedSlice"(%437, %15, %15, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %439 = "tf.Shape"(%432) {device = ""} : (tensor) -> tensor<2xi32> - %440 = "tf.StridedSlice"(%439, %6, %15, %16) {begin_mask = 0 : i64, device = "", 
ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %441 = "tf.ConcatV2"(%438, %436, %440, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %442 = "tf.Reshape"(%432, %441) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %443 = "tf.ExpandDims"(%427, %3) {device = ""} : (tensor, tensor) -> tensor - %444 = "tf.Less"(%430, %443) {device = ""} : (tensor, tensor) -> tensor - %445 = "tf.Reshape"(%444, %5) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %446 = "tf.Where"(%445) {device = ""} : (tensor) -> tensor - %447 = "tf.Squeeze"(%446) {device = "", squeeze_dims = [1]} : (tensor) -> tensor - %448 = "tf.GatherV2"(%442, %447, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %449 = "tf.Cast"(%425) {Truncate = false, device = ""} : (tensor) -> tensor - %450 = "tf.BroadcastTo"(%449, %404) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %451 = "tf.Max"(%450, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor - %452 = "tf.Maximum"(%14, %451) {device = ""} : (tensor, tensor) -> tensor - %453 = "tf.Range"(%14, %452, %7) {device = ""} : (tensor, tensor, tensor) -> tensor - %454 = "tf.Pack"(%7, %452) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %455 = "tf.Tile"(%401, %454) {device = ""} : (tensor, tensor<2xi32>) -> tensor - %456 = "tf.Shape"(%455) {device = ""} : (tensor) -> tensor<2xi32> - %457 = "tf.StridedSlice"(%456, %15, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> - %458 = "tf.Prod"(%457, %15) {device = "", keep_dims = false} : (tensor<2xi32>, tensor<1xi32>) -> tensor - %459 = "tf.Pack"(%458) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %460 = "tf.Shape"(%455) {device = ""} : (tensor) -> tensor<2xi32> - %461 = "tf.StridedSlice"(%460, %15, %15, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %462 = "tf.Shape"(%455) {device = ""} : (tensor) -> tensor<2xi32> - %463 = "tf.StridedSlice"(%462, %6, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %464 = "tf.ConcatV2"(%461, %459, %463, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %465 = "tf.Reshape"(%455, %464) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %466 = "tf.ExpandDims"(%450, %3) {device = ""} : (tensor, tensor) -> tensor - %467 = "tf.Less"(%453, %466) {device = ""} : (tensor, tensor) -> tensor - %468 = "tf.Reshape"(%467, %5) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %469 = "tf.Where"(%468) {device = ""} : (tensor) -> tensor - %470 = "tf.Squeeze"(%469) {device = "", squeeze_dims = [1]} : (tensor) -> tensor - %471 = "tf.GatherV2"(%465, %470, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %472:2 = "tf.RaggedRange"(%448, %471, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) - %473 = "tf.GatherV2"(%389, %472#1, %14) 
{batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %474 = "tf.If"(%315, %315, %311, %13) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_Assert_1_AssertGuard_false_16370, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_Assert_1_AssertGuard_true_16360} : (tensor, tensor, tensor, tensor) -> tensor - %475 = "tf.Identity"(%474) {device = ""} : (tensor) -> tensor - %476 = "tf.Select"(%2, %311, %13) {device = ""} : (tensor, tensor, tensor) -> tensor - %477 = "tf.Pack"(%476) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %478 = "tf.ConcatV2"(%1, %477, %12, %14) {device = ""} : (tensor<0xi64>, tensor<1xi64>, tensor<1xi64>, tensor) -> tensor<2xi64> - %479 = "tf.StridedSlice"(%478, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %480 = "tf.Equal"(%479, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %481 = "tf.StridedSlice"(%478, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %482 = "tf.StridedSlice"(%478, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %483 = "tf.Equal"(%482, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %484 = "tf.If"(%483, %483, %482, %425) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_Assert_2_AssertGuard_false_16860, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_Assert_2_AssertGuard_true_16850} : (tensor, tensor, tensor, tensor) -> tensor - %485 = "tf.Identity"(%484) {device = ""} : (tensor) -> tensor - %486 = "tf.If"(%480, %480, %425, %481) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_Assert_3_AssertGuard_false_17220, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_Assert_3_AssertGuard_true_17210} : (tensor, tensor, tensor, tensor) -> tensor - %487 = "tf.Identity"(%486) {device = ""} : (tensor) -> tensor - %488 = "tf.If"(%351, %351, %13, %347) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedGather_1_Assert_AssertGuard_false_21900, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedGather_1_Assert_AssertGuard_true_21890} : (tensor, tensor, tensor, tensor) -> tensor - %489 = "tf.Identity"(%488) {device = ""} : (tensor) -> tensor - %490 = "tf.Equal"(%347, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %491 = "tf.Select"(%490, %13, %347) {device = ""} : (tensor, tensor, tensor) -> tensor - %492 = "tf.Equal"(%491, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %493 = "tf.LogicalOr"(%492, %2) {device = ""} : (tensor, tensor) -> tensor - %494 = "tf.Equal"(%491, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %495 = 
"tf.LogicalOr"(%493, %494) {device = ""} : (tensor, tensor) -> tensor - %496 = "tf.Select"(%358, %491, %13) {device = ""} : (tensor, tensor, tensor) -> tensor - %497 = "tf.Pack"(%496, %13) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> - %498 = "tf.StridedSlice"(%497, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %499 = "tf.Cast"(%498) {Truncate = false, device = ""} : (tensor) -> tensor - %500 = "tf.Reshape"(%499, %9) {device = ""} : (tensor, tensor<0xi32>) -> tensor - %501 = "tf.Pack"(%7, %500) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %502 = "tf.Tile"(%359, %501) {device = ""} : (tensor, tensor<2xi32>) -> tensor - %503 = "tf.Mul"(%500, %362) {device = ""} : (tensor, tensor) -> tensor - %504 = "tf.Pack"(%503) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %505 = "tf.ConcatV2"(%361, %504, %363, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %506 = "tf.Reshape"(%502, %505) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %507 = "tf.Shape"(%506) {device = ""} : (tensor) -> tensor<1xi64> - %508 = "tf.StridedSlice"(%507, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %509 = "tf.Pack"(%498) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %510 = "tf.StridedSlice"(%506, %509, %11, %12) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %511 = "tf.Sub"(%508, %498) {device = ""} : (tensor, tensor) -> tensor - %512 = "tf.Pack"(%511) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %513 = "tf.StridedSlice"(%506, %11, %512, %12) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %514:2 = "tf.RaggedRange"(%513, %510, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) - %515 = "tf.Select"(%2, %491, %13) {device = ""} : (tensor, tensor, tensor) -> tensor - %516 = "tf.Pack"(%515, %13) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> - %517 = "tf.StridedSlice"(%516, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %518 = "tf.Cast"(%517) {Truncate = false, device = ""} : (tensor) -> tensor - %519 = "tf.Reshape"(%518, %9) {device = ""} : (tensor, tensor<0xi32>) -> tensor - %520 = "tf.Pack"(%7, %519) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %521 = "tf.Tile"(%4, %520) {device = ""} : (tensor<2x1xi64>, tensor<2xi32>) -> tensor<2x?xi64> - %522 = "tf.Mul"(%519, %8) {device = ""} : (tensor, tensor) -> tensor - %523 = "tf.Pack"(%522) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %524 = "tf.ConcatV2"(%9, %523, %9, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %525 = "tf.Reshape"(%521, %524) {device = ""} : (tensor<2x?xi64>, 
tensor<1xi32>) -> tensor - %526 = "tf.Shape"(%525) {device = ""} : (tensor) -> tensor<1xi64> - %527 = "tf.StridedSlice"(%526, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %528 = "tf.Pack"(%517) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %529 = "tf.StridedSlice"(%525, %528, %11, %12) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %530 = "tf.Sub"(%527, %517) {device = ""} : (tensor, tensor) -> tensor - %531 = "tf.Pack"(%530) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %532 = "tf.StridedSlice"(%525, %11, %531, %12) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %533:2 = "tf.RaggedRange"(%532, %529, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) - %534 = "tf.StridedSlice"(%516, %15, %16, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> - %535 = "tf.StridedSlice"(%516, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %536 = "tf.Mul"(%535, %12) {device = ""} : (tensor, tensor<1xi64>) -> tensor<1xi64> - %537 = "tf.Tile"(%536, %534) {device = ""} : (tensor<1xi64>, tensor<1xi64>) -> tensor - %538 = "tf.Cumsum"(%537, %14) {device = "", exclusive = false, reverse = false} : (tensor, tensor) -> tensor - %539 = "tf.ConcatV2"(%11, %538, %3) {device = ""} : (tensor<1xi64>, tensor, tensor) -> tensor - %540 = "tf.StridedSlice"(%539, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %541 = "tf.ExpandDims"(%540, %7) {device = ""} : (tensor, tensor) -> tensor - %542 = "tf.Shape"(%540) {device = ""} : (tensor) -> tensor<1xi32> - %543 = "tf.StridedSlice"(%542, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %544 = "tf.Pack"(%543) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %545 = "tf.StridedSlice"(%539, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %546 = "tf.ExpandDims"(%545, %7) {device = ""} : (tensor, tensor) -> tensor - %547 = "tf.Shape"(%545) {device = ""} : (tensor) -> tensor<1xi32> - %548 = "tf.StridedSlice"(%547, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %549 = "tf.Pack"(%548) {axis = 0 : i64, device = ""} : 
(tensor) -> tensor<1xi32> - %550 = "tf.Equal"(%347, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %551 = "tf.Select"(%550, %491, %13) {device = ""} : (tensor, tensor, tensor) -> tensor - %552 = "tf.Cast"(%551) {Truncate = false, device = ""} : (tensor) -> tensor - %553 = "tf.Reshape"(%552, %9) {device = ""} : (tensor, tensor<0xi32>) -> tensor - %554 = "tf.Pack"(%7, %553) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %555 = "tf.Mul"(%553, %8) {device = ""} : (tensor, tensor) -> tensor - %556 = "tf.Pack"(%555) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %557 = "tf.ConcatV2"(%9, %556, %9, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %558 = "tf.Pack"(%551) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %559 = "tf.Pack"(%10, %347) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> - %560 = "tf.ExpandDims"(%559, %7) {device = ""} : (tensor<2xi64>, tensor) -> tensor<2x1xi64> - %561 = "tf.Tile"(%560, %554) {device = ""} : (tensor<2x1xi64>, tensor<2xi32>) -> tensor<2x?xi64> - %562 = "tf.Reshape"(%561, %557) {device = ""} : (tensor<2x?xi64>, tensor<1xi32>) -> tensor - %563 = "tf.Shape"(%562) {device = ""} : (tensor) -> tensor<1xi64> - %564 = "tf.StridedSlice"(%563, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %565 = "tf.Sub"(%564, %551) {device = ""} : (tensor, tensor) -> tensor - %566 = "tf.Pack"(%565) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %567 = "tf.StridedSlice"(%562, %11, %566, %12) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %568 = "tf.StridedSlice"(%562, %558, %11, %12) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %569:2 = "tf.RaggedRange"(%567, %568, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) - %570 = "tf.GatherV2"(%354, %569#1, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %571 = "tf.Cast"(%570) {Truncate = false, device = ""} : (tensor) -> tensor - %572 = "tf.BroadcastTo"(%571, %544) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %573 = "tf.Max"(%572, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor - %574 = "tf.Maximum"(%14, %573) {device = ""} : (tensor, tensor) -> tensor - %575 = "tf.Range"(%14, %574, %7) {device = ""} : (tensor, tensor, tensor) -> tensor - %576 = "tf.Pack"(%7, %574) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %577 = "tf.Tile"(%541, %576) {device = ""} : (tensor, tensor<2xi32>) -> tensor - %578 = "tf.Shape"(%577) {device = ""} : (tensor) -> tensor<2xi32> - %579 = "tf.StridedSlice"(%578, %15, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> - %580 = "tf.Prod"(%579, %15) {device = "", keep_dims = false} : (tensor<2xi32>, tensor<1xi32>) -> tensor - %581 = "tf.Pack"(%580) {axis = 0 : i64, device = ""} : (tensor) -> 
tensor<1xi32> - %582 = "tf.Shape"(%577) {device = ""} : (tensor) -> tensor<2xi32> - %583 = "tf.StridedSlice"(%582, %15, %15, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %584 = "tf.Shape"(%577) {device = ""} : (tensor) -> tensor<2xi32> - %585 = "tf.StridedSlice"(%584, %6, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %586 = "tf.ConcatV2"(%583, %581, %585, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %587 = "tf.Reshape"(%577, %586) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %588 = "tf.ExpandDims"(%572, %3) {device = ""} : (tensor, tensor) -> tensor - %589 = "tf.Less"(%575, %588) {device = ""} : (tensor, tensor) -> tensor - %590 = "tf.Reshape"(%589, %5) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %591 = "tf.Where"(%590) {device = ""} : (tensor) -> tensor - %592 = "tf.Squeeze"(%591) {device = "", squeeze_dims = [1]} : (tensor) -> tensor - %593 = "tf.GatherV2"(%587, %592, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %594 = "tf.Cast"(%570) {Truncate = false, device = ""} : (tensor) -> tensor - %595 = "tf.BroadcastTo"(%594, %549) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %596 = "tf.Max"(%595, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor - %597 = "tf.Maximum"(%14, %596) {device = ""} : (tensor, tensor) -> tensor - %598 = "tf.Range"(%14, %597, %7) {device = ""} : (tensor, tensor, tensor) -> tensor - %599 = "tf.Pack"(%7, %597) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %600 = "tf.Tile"(%546, %599) {device = ""} : (tensor, tensor<2xi32>) -> tensor - %601 = "tf.Shape"(%600) {device = ""} : (tensor) -> tensor<2xi32> - %602 = "tf.StridedSlice"(%601, %15, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> - %603 = "tf.Prod"(%602, %15) {device = "", keep_dims = false} : (tensor<2xi32>, tensor<1xi32>) -> tensor - %604 = "tf.Pack"(%603) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %605 = "tf.Shape"(%600) {device = ""} : (tensor) -> tensor<2xi32> - %606 = "tf.StridedSlice"(%605, %15, %15, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %607 = "tf.Shape"(%600) {device = ""} : (tensor) -> tensor<2xi32> - %608 = "tf.StridedSlice"(%607, %6, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %609 = "tf.ConcatV2"(%606, %604, %608, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %610 = "tf.Reshape"(%600, %609) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %611 = "tf.ExpandDims"(%595, %3) {device = ""} : (tensor, tensor) -> tensor - %612 = "tf.Less"(%598, %611) {device = ""} : (tensor, tensor) -> tensor - %613 = 
"tf.Reshape"(%612, %5) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %614 = "tf.Where"(%613) {device = ""} : (tensor) -> tensor - %615 = "tf.Squeeze"(%614) {device = "", squeeze_dims = [1]} : (tensor) -> tensor - %616 = "tf.GatherV2"(%610, %615, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %617:2 = "tf.RaggedRange"(%593, %616, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) - %618 = "tf.If"(%495, %495, %491, %13) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedGather_1_Assert_1_AssertGuard_false_22970, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedGather_1_Assert_1_AssertGuard_true_22960} : (tensor, tensor, tensor, tensor) -> tensor - %619 = "tf.Identity"(%618) {device = ""} : (tensor) -> tensor - %620 = "tf.Select"(%2, %491, %13) {device = ""} : (tensor, tensor, tensor) -> tensor - %621 = "tf.Pack"(%620) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %622 = "tf.ConcatV2"(%1, %621, %12, %14) {device = ""} : (tensor<0xi64>, tensor<1xi64>, tensor<1xi64>, tensor) -> tensor<2xi64> - %623 = "tf.StridedSlice"(%622, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %624 = "tf.Equal"(%623, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %625 = "tf.StridedSlice"(%622, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %626 = "tf.StridedSlice"(%622, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %627 = "tf.Equal"(%626, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %628 = "tf.If"(%627, %627, %626, %570) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedGather_1_Assert_2_AssertGuard_false_23460, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedGather_1_Assert_2_AssertGuard_true_23450} : (tensor, tensor, tensor, tensor) -> tensor - %629 = "tf.Identity"(%628) {device = ""} : (tensor) -> tensor - %630 = "tf.If"(%624, %624, %570, %625) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedGather_1_Assert_3_AssertGuard_false_23820, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedGather_1_Assert_3_AssertGuard_true_23810} : (tensor, tensor, tensor, tensor) -> tensor - %631 = "tf.Identity"(%79) {device = ""} : (tensor) -> tensor - %632 = "tf.Identity"(%630) {device = ""} : (tensor) -> tensor - %633 = "tf.Identity"(%307) {device = ""} : (tensor) -> tensor - %634 = "tf.Shape"(%36#2) {device = ""} : (tensor) -> tensor<1xi32> - %635 = "tf.StridedSlice"(%634, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %636 = 
"tf.Cast"(%635) {Truncate = false, device = ""} : (tensor<0xi32>) -> tensor<0xi64> - %637 = "tf.Identity"(%636) {device = ""} : (tensor<0xi64>) -> tensor<0xi64> - %638 = "tf.Shape"(%36#3) {device = ""} : (tensor) -> tensor<1xi32> - %639 = "tf.StridedSlice"(%638, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %640 = "tf.Cast"(%639) {Truncate = false, device = ""} : (tensor<0xi32>) -> tensor<0xi64> - %641 = "tf.Identity"(%640) {device = ""} : (tensor<0xi64>) -> tensor<0xi64> - %642 = "tf.GatherV2"(%36#3, %335, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %643 = "tf.Tile"(%642, %339) {device = ""} : (tensor, tensor<1xi64>) -> tensor - %644 = "tf.Sub"(%643, %473) {device = ""} : (tensor, tensor) -> tensor - %645 = "tf.Shape"(%644) {device = ""} : (tensor) -> tensor<1xi32> - %646 = "tf.StridedSlice"(%645, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %647 = "tf.Cast"(%646) {Truncate = false, device = ""} : (tensor<0xi32>) -> tensor<0xi64> - %648 = "tf.Identity"(%647) {device = ""} : (tensor<0xi64>) -> tensor<0xi64> - %649 = "tf.UnicodeEncode"(%36#0, %57) {Tsplits = i64, device = "", errors = "replace", output_encoding = "UTF-8", replacement_char = 65533 : i64} : (tensor, tensor) -> tensor - %650 = "tf.Identity"(%649) {device = ""} : (tensor) -> tensor - return %650, %631 : tensor, tensor - } - func @WhitespaceTokenize_RaggedConcat_assert_equal_1_Assert_AssertGuard_false_3210(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Input tensors have incompatible shapes."> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/RaggedConcat/RaggedFromTensor/Const:0) = "> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/RaggedConcat/RaggedNRows/Const:0) = "> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_RaggedConcat_assert_equal_1_Assert_AssertGuard_true_3200(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_3970(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:zero"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<"Condition x == y 
did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits/RowPartitionFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits/RowPartitionFromRowSplits/Const:0) = "> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_3960(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_4330(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:monotonic"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<"Condition x >= 0 did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits/RowPartitionFromRowSplits/sub:0) = "> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor) -> () - %3 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %4 = "tf.Identity"(%3) {device = ""} : (tensor) -> tensor - return %4 : tensor - } - func @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_4320(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_4670(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Arguments to _from_row_partition do not form a valid RaggedTensor"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits/strided_slice_1:0) = "> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = 
"tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_4660(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_5040(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:zero"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits_1/RowPartitionFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits_1/RowPartitionFromRowSplits/Const:0) = "> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_5030(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_5400(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:monotonic"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<"Condition x >= 0 did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits_1/RowPartitionFromRowSplits/sub:0) = "> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor) -> () - %3 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %4 = "tf.Identity"(%3) {device = ""} : (tensor) -> tensor - return %4 : tensor - } - func @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_5390(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func 
@WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_assert_equal_1_Assert_AssertGuard_false_5760(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Arguments to _from_row_partition do not form a valid RaggedTensor"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits_1/strided_slice:0) = "> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits_1/RaggedNRows/sub:0) = "> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_assert_equal_1_Assert_AssertGuard_true_5750(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_6110(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:zero"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/RaggedFromNestedRowSplits_1/RaggedFromRowSplits/RowPartitionFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/RaggedFromNestedRowSplits_1/RaggedFromRowSplits/RowPartitionFromRowSplits/Const:0) = "> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_6100(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_6470(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:monotonic"> : tensor} : () -> tensor - %1 = 
"tf.Const"() {value = dense<"Condition x >= 0 did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/RaggedFromNestedRowSplits_1/RaggedFromRowSplits/RowPartitionFromRowSplits/sub:0) = "> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor) -> () - %3 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %4 = "tf.Identity"(%3) {device = ""} : (tensor) -> tensor - return %4 : tensor - } - func @WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_6460(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_6810(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Arguments to _from_row_partition do not form a valid RaggedTensor"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/RaggedFromNestedRowSplits_1/RaggedFromRowSplits/strided_slice_1:0) = "> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/RaggedFromNestedRowSplits_1/RaggedFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_6800(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_7180(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:zero"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/RaggedFromNestedRowSplits_2/RaggedFromRowSplits/RowPartitionFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/RaggedFromNestedRowSplits_2/RaggedFromRowSplits/RowPartitionFromRowSplits/Const:0) = "> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : 
(tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_7170(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_7540(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:monotonic"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<"Condition x >= 0 did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/RaggedFromNestedRowSplits_2/RaggedFromRowSplits/RowPartitionFromRowSplits/sub:0) = "> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor) -> () - %3 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %4 = "tf.Identity"(%3) {device = ""} : (tensor) -> tensor - return %4 : tensor - } - func @WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_7530(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_7880(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Arguments to _from_row_partition do not form a valid RaggedTensor"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/RaggedFromNestedRowSplits_2/RaggedFromRowSplits/strided_slice_1:0) = "> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/RaggedFromNestedRowSplits_2/RaggedFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_7870(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_RaggedGather_Assert_AssertGuard_false_8680(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> 
tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_RaggedGather_Assert_AssertGuard_true_8670(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_RaggedGather_Assert_1_AssertGuard_false_9750(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_RaggedGather_Assert_1_AssertGuard_true_9740(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_RaggedGather_Assert_2_AssertGuard_false_10240(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_RaggedGather_Assert_2_AssertGuard_true_10230(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func 
@WhitespaceTokenize_RaggedGather_Assert_3_AssertGuard_false_10600(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_RaggedGather_Assert_3_AssertGuard_true_10590(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_Assert_AssertGuard_false_15300(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_Assert_AssertGuard_true_15290(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_Assert_1_AssertGuard_false_16370(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_Assert_1_AssertGuard_true_16360(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - 
func @WhitespaceTokenize_Assert_2_AssertGuard_false_16860(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_Assert_2_AssertGuard_true_16850(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_Assert_3_AssertGuard_false_17220(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_Assert_3_AssertGuard_true_17210(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_RaggedGather_1_Assert_AssertGuard_false_21900(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_RaggedGather_1_Assert_AssertGuard_true_21890(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - 
} - func @WhitespaceTokenize_RaggedGather_1_Assert_1_AssertGuard_false_22970(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_RaggedGather_1_Assert_1_AssertGuard_true_22960(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_RaggedGather_1_Assert_2_AssertGuard_false_23460(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_RaggedGather_1_Assert_2_AssertGuard_true_23450(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_RaggedGather_1_Assert_3_AssertGuard_false_23820(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_RaggedGather_1_Assert_3_AssertGuard_true_23810(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = 
"tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - - // CHECK: func @whitespace_tokenizer_rank1(%arg0: tensor<1x!tf.string> {tf._user_specified_name = "input"}) -> (tensor, tensor) attributes {tf._input_shapes = [#tf.shape<1>], tf.api_implements = "tftext:WhitespaceTokenizer", tf.signature.is_stateful} { - // CHECK: %0:2 = "tfl.custom"(%arg0) {custom_code = "tftext:WhitespaceTokenizer", custom_option = opaque<"tfl", "0x"> : tensor<0xi8>} : (tensor<1x!tf.string>) -> (tensor, tensor) - // CHECK: return %0#0, %0#1 : tensor, tensor - - func @whitespace_tokenizer_rank2(%arg0: tensor {tf._user_specified_name = "input"}) -> (tensor, tensor, tensor) attributes {tf._input_shapes = [#tf.shape], tf.api_implements = "tftext:WhitespaceTokenizer", tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<[]> : tensor<0xi64>} : () -> tensor<0xi64> - %1 = "tf.Const"() {value = dense : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<[[0], [1]]> : tensor<2x1xi64>} : () -> tensor<2x1xi64> - %4 = "tf.Const"() {value = dense<[2, -1]> : tensor<2xi32>} : () -> tensor<2xi32> - %5 = "tf.Const"() {value = dense<2> : tensor} : () -> tensor - %6 = "tf.Const"() {value = dense<-1> : tensor<1xi32>} : () -> tensor<1xi32> - %7 = "tf.Const"() {value = dense<2> : tensor<1xi32>} : () -> tensor<1xi32> - %8 = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32> - %9 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - %10 = "tf.Const"() {value = dense<2> : tensor} : () -> tensor - %11 = "tf.Const"() {value = dense<[]> : tensor<0xi32>} : () -> tensor<0xi32> - %12 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - %13 = "tf.Const"() {value = dense<0> : tensor<1xi64>} : () -> tensor<1xi64> - %14 = "tf.Const"() {value = dense<1> : tensor<1xi64>} : () -> tensor<1xi64> - %15 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - %16 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - %17 = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> - %18 = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> - %19 = "tf.Shape"(%arg0) {device = ""} : (tensor) -> tensor<2xi64> - %20 = "tf.StridedSlice"(%19, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %21 = "tf.StridedSlice"(%19, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %22 = "tf.Mul"(%20, %21) {device = ""} : (tensor, tensor) -> tensor - %23 = "tf.Pack"(%22) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %24 = "tf.StridedSlice"(%19, %7, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi64> - %25 = "tf.ConcatV2"(%23, %24, %16) {device = ""} : (tensor<1xi64>, tensor<0xi64>, tensor) -> tensor<1xi64> - %26 = "tf.Reshape"(%arg0, %25) {device = ""} : (tensor, tensor<1xi64>) -> tensor - %27 = "tf.StringLength"(%26) {device = "", unit = "BYTE"} : (tensor) -> tensor - %28 = "tf.ExpandDims"(%27, %9) {device = ""} : (tensor, tensor) -> tensor - %29 = "tf.Cast"(%28) {Truncate = 
false, device = ""} : (tensor) -> tensor - %30 = "tf.Shape"(%29) {device = ""} : (tensor) -> tensor<2xi64> - %31 = "tf.StridedSlice"(%30, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %32 = "tf.StridedSlice"(%30, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %33 = "tf.Mul"(%31, %32) {device = ""} : (tensor, tensor) -> tensor - %34 = "tf.Pack"(%33) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %35 = "tf.StridedSlice"(%30, %7, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi64> - %36 = "tf.ConcatV2"(%34, %35, %16) {device = ""} : (tensor<1xi64>, tensor<0xi64>, tensor) -> tensor<1xi64> - %37 = "tf.Reshape"(%29, %36) {device = ""} : (tensor, tensor<1xi64>) -> tensor - %38 = "tf.StridedSlice"(%30, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %39 = "tf.AddV2"(%38, %15) {device = ""} : (tensor, tensor) -> tensor - %40 = "tf.Range"(%12, %39, %15) {device = ""} : (tensor, tensor, tensor) -> tensor - %41 = "tf.Mul"(%40, %15) {device = ""} : (tensor, tensor) -> tensor - %42 = "tf.Reshape"(%26, %6) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %43:3 = "tf.UnicodeDecodeWithOffsets"(%42) {Tsplits = i64, device = "", errors = "replace", input_encoding = "UTF-8", replace_control_characters = false, replacement_char = 65533 : i64} : (tensor) -> (tensor, tensor, tensor) - %44 = "tf.StridedSlice"(%43#0, %17, %6, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %45 = "tf.Shape"(%44) {device = ""} : (tensor) -> tensor<1xi32> - %46 = "tf.ConcatV2"(%45, %18, %16) {device = ""} : (tensor<1xi32>, tensor<1xi32>, tensor) -> tensor<2xi32> - %47 = "tf.Reshape"(%44, %46) {device = ""} : (tensor, tensor<2xi32>) -> tensor - %48 = "tf.Shape"(%47) {device = ""} : (tensor) -> tensor<2xi64> - %49 = "tf.StridedSlice"(%48, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %50 = "tf.AddV2"(%49, %15) {device = ""} : (tensor, tensor) -> tensor - %51 = "tf.Range"(%12, %50, %15) {device = ""} : (tensor, tensor, tensor) -> tensor - %52 = "tf.Mul"(%51, %15) {device = ""} : (tensor, tensor) -> tensor - %53 = "tf.ExpandDims"(%52, %9) {device = ""} : (tensor, tensor) -> tensor - %54 = "tf.Shape"(%52) {device = ""} : (tensor) -> tensor<1xi32> - %55 = "tf.StridedSlice"(%54, %17, %17, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %56 = "tf.StridedSlice"(%54, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : 
i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %57 = "tf.StridedSlice"(%54, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %58 = "tf.StridedSlice"(%52, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %59 = "tf.StridedSlice"(%52, %17, %6, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %60 = "tf.Sub"(%58, %59) {device = ""} : (tensor, tensor) -> tensor - %61 = "tf.Shape"(%47) {device = ""} : (tensor) -> tensor<2xi32> - %62 = "tf.Cast"(%61) {Truncate = false, device = ""} : (tensor<2xi32>) -> tensor<2xi64> - %63 = "tf.StridedSlice"(%62, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %64 = "tf.Equal"(%63, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %65 = "tf.StridedSlice"(%62, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %66 = "tf.Equal"(%65, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %67 = "tf.StridedSlice"(%62, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %68 = "tf.Shape"(%47) {device = ""} : (tensor) -> tensor<2xi32> - %69 = "tf.Cast"(%68) {Truncate = false, device = ""} : (tensor<2xi32>) -> tensor<2xi64> - %70 = "tf.StridedSlice"(%69, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %71 = "tf.Equal"(%70, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %72 = "tf.StridedSlice"(%43#0, %17, %6, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %73 = "tf.AddV2"(%72, %15) {device = ""} : (tensor, tensor) -> tensor - %74 = "tf.StridedSlice"(%43#0, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %75 = "tf.Minimum"(%73, %74) {device = ""} : (tensor, tensor) -> tensor - %76:2 = "tf.RaggedRange"(%75, %74, %15) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) - %77 = "tf.Shape"(%76#0) {device = ""} : (tensor) -> tensor<1xi64> - %78 = "tf.StridedSlice"(%77, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, 
end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %79 = "tf.Sub"(%78, %15) {device = ""} : (tensor, tensor) -> tensor - %80 = "tf.Equal"(%38, %79) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %81 = "tf.All"(%80, %11) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor - %82 = "tf.If"(%81, %81, %38, %79) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedConcat_assert_equal_1_Assert_AssertGuard_false_99640, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedConcat_assert_equal_1_Assert_AssertGuard_true_99630} : (tensor, tensor, tensor, tensor) -> tensor - %83 = "tf.Identity"(%82) {device = ""} : (tensor) -> tensor - %84 = "tf.StridedSlice"(%41, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %85 = "tf.Mul"(%79, %5) {device = ""} : (tensor, tensor) -> tensor - %86 = "tf.Range"(%12, %85, %15) {device = ""} : (tensor, tensor, tensor) -> tensor - %87 = "tf.Reshape"(%86, %4) {device = ""} : (tensor, tensor<2xi32>) -> tensor<2x?xi64> - %88 = "tf.Transpose"(%87, %8) {device = ""} : (tensor<2x?xi64>, tensor<2xi32>) -> tensor - %89 = "tf.Reshape"(%88, %6) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %90 = "tf.StridedSlice"(%76#0, %6, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %91 = "tf.AddV2"(%84, %90) {device = ""} : (tensor, tensor) -> tensor - %92 = "tf.ConcatV2"(%76#0, %91, %16) {device = ""} : (tensor, tensor, tensor) -> tensor - %93 = "tf.GatherV2"(%43#2, %76#1, %16) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %94 = "tf.ConcatV2"(%93, %37, %16) {device = ""} : (tensor, tensor, tensor) -> tensor - %95:2 = "tf.RaggedGather"(%92, %94, %89) {OUTPUT_RAGGED_RANK = 1 : i64, PARAMS_RAGGED_RANK = 1 : i64, Tindices = i64, Tsplits = i64, Tvalues = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) - %96 = "tf.StridedSlice"(%95#0, %17, %17, %7) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %97 = "tf.StridedSlice"(%96, %17, %6, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %98 = "tf.Shape"(%97) {device = ""} : (tensor) -> tensor<1xi32> - %99 = "tf.ConcatV2"(%98, %18, %16) {device = ""} : (tensor<1xi32>, tensor<1xi32>, tensor) -> tensor<2xi32> - %100 = "tf.Reshape"(%97, %99) {device = ""} : (tensor, tensor<2xi32>) -> tensor - %101 = "tf.Shape"(%100) {device = ""} : (tensor) -> tensor<2xi64> - %102 = "tf.StridedSlice"(%101, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %103 = "tf.AddV2"(%102, %15) {device = ""} 
: (tensor, tensor) -> tensor - %104 = "tf.Range"(%12, %103, %15) {device = ""} : (tensor, tensor, tensor) -> tensor - %105 = "tf.Mul"(%104, %15) {device = ""} : (tensor, tensor) -> tensor - %106 = "tf.ExpandDims"(%105, %9) {device = ""} : (tensor, tensor) -> tensor - %107 = "tf.Shape"(%105) {device = ""} : (tensor) -> tensor<1xi32> - %108 = "tf.StridedSlice"(%107, %17, %17, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %109 = "tf.StridedSlice"(%107, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %110 = "tf.StridedSlice"(%107, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %111 = "tf.StridedSlice"(%105, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %112 = "tf.StridedSlice"(%105, %17, %6, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %113 = "tf.Sub"(%111, %112) {device = ""} : (tensor, tensor) -> tensor - %114 = "tf.Shape"(%100) {device = ""} : (tensor) -> tensor<2xi32> - %115 = "tf.Cast"(%114) {Truncate = false, device = ""} : (tensor<2xi32>) -> tensor<2xi64> - %116 = "tf.StridedSlice"(%115, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %117 = "tf.Equal"(%116, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %118 = "tf.StridedSlice"(%115, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %119 = "tf.Equal"(%118, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %120 = "tf.StridedSlice"(%115, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %121 = "tf.Shape"(%100) {device = ""} : (tensor) -> tensor<2xi32> - %122 = "tf.Cast"(%121) {Truncate = false, device = ""} : (tensor<2xi32>) -> tensor<2xi64> - %123 = "tf.StridedSlice"(%122, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %124 = "tf.Equal"(%123, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %125:5 = "tf.WhitespaceTokenizeWithOffsets"(%43#1, %43#0) {Tsplits = i64, device = ""} : (tensor, tensor) -> (tensor, tensor, tensor, tensor, tensor) - %126 = "tf.StridedSlice"(%125#1, %17, %18, %18) 
{begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %127 = "tf.Equal"(%126, %12) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %128 = "tf.All"(%127, %11) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor - %129 = "tf.If"(%128, %128, %126, %12) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_100400, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_100390} : (tensor, tensor, tensor, tensor) -> tensor - %130 = "tf.Identity"(%129) {device = ""} : (tensor) -> tensor - %131 = "tf.StridedSlice"(%125#1, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %132 = "tf.StridedSlice"(%125#1, %17, %6, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %133 = "tf.Sub"(%131, %132) {device = ""} : (tensor, tensor) -> tensor - %134 = "tf.LessEqual"(%12, %133) {device = ""} : (tensor, tensor) -> tensor - %135 = "tf.All"(%134, %17) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor - %136 = "tf.If"(%135, %135, %133) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_100760, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_100750} : (tensor, tensor, tensor) -> tensor - %137 = "tf.Identity"(%136) {device = ""} : (tensor) -> tensor - %138 = "tf.Identity"(%125#1) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor - %139 = "tf.StridedSlice"(%138, %6, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %140 = "tf.Shape"(%125#0) {device = ""} : (tensor) -> tensor<1xi64> - %141 = "tf.StridedSlice"(%140, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %142 = "tf.Equal"(%139, %141) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %143 = "tf.All"(%142, %11) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor - %144 = "tf.If"(%143, %143, %139, %141) {_lower_using_switch_merge = true, 
_read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_101100, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_101090} : (tensor, tensor, tensor, tensor) -> tensor - %145 = "tf.Identity"(%144) {device = ""} : (tensor) -> tensor - %146 = "tf.Identity"(%138) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor - %147 = "tf.Shape"(%146) {device = ""} : (tensor) -> tensor<1xi64> - %148 = "tf.StridedSlice"(%147, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %149 = "tf.Sub"(%148, %15) {device = ""} : (tensor, tensor) -> tensor - %150 = "tf.StridedSlice"(%125#4, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %151 = "tf.Equal"(%150, %12) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %152 = "tf.All"(%151, %11) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor - %153 = "tf.If"(%152, %152, %150, %12) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_101470, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_101460} : (tensor, tensor, tensor, tensor) -> tensor - %154 = "tf.Identity"(%153) {device = ""} : (tensor) -> tensor - %155 = "tf.StridedSlice"(%125#4, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %156 = "tf.StridedSlice"(%125#4, %17, %6, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %157 = "tf.Sub"(%155, %156) {device = ""} : (tensor, tensor) -> tensor - %158 = "tf.LessEqual"(%12, %157) {device = ""} : (tensor, tensor) -> tensor - %159 = "tf.All"(%158, %17) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor - %160 = "tf.If"(%159, %159, %157) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_101830, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = 
@WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_101820} : (tensor, tensor, tensor) -> tensor - %161 = "tf.Identity"(%160) {device = ""} : (tensor) -> tensor - %162 = "tf.Identity"(%125#4) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor - %163 = "tf.StridedSlice"(%162, %6, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %164 = "tf.Equal"(%163, %149) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %165 = "tf.All"(%164, %11) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor - %166 = "tf.If"(%165, %165, %163, %149) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_assert_equal_1_Assert_AssertGuard_false_102190, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_assert_equal_1_Assert_AssertGuard_true_102180} : (tensor, tensor, tensor, tensor) -> tensor - %167 = "tf.Identity"(%166) {device = ""} : (tensor) -> tensor - %168 = "tf.Identity"(%162) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor - %169 = "tf.StridedSlice"(%125#4, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %170 = "tf.Equal"(%169, %12) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %171 = "tf.All"(%170, %11) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor - %172 = "tf.If"(%171, %171, %169, %12) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_102540, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_102530} : (tensor, tensor, tensor, tensor) -> tensor - %173 = "tf.Identity"(%172) {device = ""} : (tensor) -> tensor - %174 = "tf.StridedSlice"(%125#4, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %175 = "tf.StridedSlice"(%125#4, %17, %6, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %176 = "tf.Sub"(%174, %175) {device = ""} : (tensor, tensor) -> tensor - %177 = "tf.LessEqual"(%12, %176) {device = ""} : (tensor, tensor) -> tensor - %178 = "tf.All"(%177, %17) {device = "", keep_dims = false} 
: (tensor, tensor<1xi32>) -> tensor - %179 = "tf.If"(%178, %178, %176) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_102900, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_102890} : (tensor, tensor, tensor) -> tensor - %180 = "tf.Identity"(%179) {device = ""} : (tensor) -> tensor - %181 = "tf.Identity"(%125#4) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor - %182 = "tf.StridedSlice"(%181, %6, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %183 = "tf.Shape"(%125#2) {device = ""} : (tensor) -> tensor<1xi64> - %184 = "tf.StridedSlice"(%183, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %185 = "tf.Equal"(%182, %184) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %186 = "tf.All"(%185, %11) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor - %187 = "tf.If"(%186, %186, %182, %184) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_103240, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_103230} : (tensor, tensor, tensor, tensor) -> tensor - %188 = "tf.Identity"(%187) {device = ""} : (tensor) -> tensor - %189 = "tf.Identity"(%181) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor - %190 = "tf.Shape"(%189) {device = ""} : (tensor) -> tensor<1xi64> - %191 = "tf.StridedSlice"(%190, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %192 = "tf.Sub"(%191, %15) {device = ""} : (tensor, tensor) -> tensor - %193 = "tf.Equal"(%192, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %194 = "tf.LogicalOr"(%64, %193) {device = ""} : (tensor, tensor) -> tensor - %195 = "tf.Equal"(%192, %63) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %196 = "tf.LogicalOr"(%194, %195) {device = ""} : (tensor, tensor) -> tensor - %197 = "tf.StridedSlice"(%189, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %198 = "tf.StridedSlice"(%189, %17, %6, %18) {begin_mask = 1 
: i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %199 = "tf.Sub"(%197, %198) {device = ""} : (tensor, tensor) -> tensor - %200 = "tf.Shape"(%189) {device = ""} : (tensor) -> tensor<1xi64> - %201 = "tf.StridedSlice"(%200, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %202 = "tf.Sub"(%201, %15) {device = ""} : (tensor, tensor) -> tensor - %203 = "tf.Equal"(%202, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %204 = "tf.ExpandDims"(%189, %9) {device = ""} : (tensor, tensor) -> tensor - %205 = "tf.Shape"(%189) {device = ""} : (tensor) -> tensor<1xi32> - %206 = "tf.StridedSlice"(%205, %17, %17, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %207 = "tf.StridedSlice"(%205, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %208 = "tf.StridedSlice"(%205, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %209 = "tf.StridedSlice"(%125#4, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %210 = "tf.Equal"(%209, %12) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %211 = "tf.All"(%210, %11) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor - %212 = "tf.If"(%211, %211, %209, %12) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_103610, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_103600} : (tensor, tensor, tensor, tensor) -> tensor - %213 = "tf.Identity"(%212) {device = ""} : (tensor) -> tensor - %214 = "tf.StridedSlice"(%125#4, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %215 = "tf.StridedSlice"(%125#4, %17, %6, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %216 = "tf.Sub"(%214, %215) {device = ""} : (tensor, tensor) -> tensor - %217 = "tf.LessEqual"(%12, %216) {device = ""} : (tensor, tensor) -> tensor - %218 = "tf.All"(%217, %17) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> 
tensor - %219 = "tf.If"(%218, %218, %216) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_103970, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_103960} : (tensor, tensor, tensor) -> tensor - %220 = "tf.Identity"(%219) {device = ""} : (tensor) -> tensor - %221 = "tf.Identity"(%125#4) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor - %222 = "tf.StridedSlice"(%221, %6, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %223 = "tf.Shape"(%125#3) {device = ""} : (tensor) -> tensor<1xi64> - %224 = "tf.StridedSlice"(%223, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %225 = "tf.Equal"(%222, %224) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %226 = "tf.All"(%225, %11) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor - %227 = "tf.If"(%226, %226, %222, %224) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_104310, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_104300} : (tensor, tensor, tensor, tensor) -> tensor - %228 = "tf.Identity"(%227) {device = ""} : (tensor) -> tensor - %229 = "tf.Identity"(%221) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor - %230 = "tf.Shape"(%229) {device = ""} : (tensor) -> tensor<1xi64> - %231 = "tf.StridedSlice"(%230, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %232 = "tf.Sub"(%231, %15) {device = ""} : (tensor, tensor) -> tensor - %233 = "tf.Equal"(%232, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %234 = "tf.LogicalOr"(%233, %1) {device = ""} : (tensor, tensor) -> tensor - %235 = "tf.Equal"(%232, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %236 = "tf.LogicalOr"(%234, %235) {device = ""} : (tensor, tensor) -> tensor - %237 = "tf.StridedSlice"(%229, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %238 = "tf.StridedSlice"(%229, %17, %6, %18) {begin_mask = 1 : i64, device = "", 
ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %239 = "tf.Sub"(%237, %238) {device = ""} : (tensor, tensor) -> tensor - %240 = "tf.Shape"(%229) {device = ""} : (tensor) -> tensor<1xi64> - %241 = "tf.StridedSlice"(%240, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %242 = "tf.Sub"(%241, %15) {device = ""} : (tensor, tensor) -> tensor - %243 = "tf.Equal"(%242, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %244 = "tf.ExpandDims"(%229, %9) {device = ""} : (tensor, tensor) -> tensor - %245 = "tf.Shape"(%229) {device = ""} : (tensor) -> tensor<1xi32> - %246 = "tf.StridedSlice"(%245, %17, %17, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %247 = "tf.StridedSlice"(%245, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %248 = "tf.StridedSlice"(%245, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %249 = "tf.StridedSlice"(%229, %6, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %250 = "tf.Range"(%12, %249, %15) {device = ""} : (tensor, tensor, tensor) -> tensor - %251 = "tf.StridedSlice"(%229, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %252 = "tf.StridedSlice"(%229, %17, %6, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %253 = "tf.Sub"(%251, %252) {device = ""} : (tensor, tensor) -> tensor - %254 = "tf.If"(%196, %196, %63, %192) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_AssertGuard_false_105110, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_AssertGuard_true_105100} : (tensor, tensor, tensor, tensor) -> tensor - %255 = "tf.Identity"(%254) {device = ""} : (tensor) -> tensor - %256 = "tf.Equal"(%192, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %257 = "tf.Select"(%256, %63, %192) {device = ""} : (tensor, tensor, tensor) -> tensor - %258 = "tf.Equal"(%257, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %259 = "tf.LogicalOr"(%258, %66) {device = ""} : (tensor, tensor) -> tensor - %260 = "tf.Equal"(%65, %257) {device = "", incompatible_shape_error = true} : (tensor, 
tensor) -> tensor - %261 = "tf.LogicalOr"(%259, %260) {device = ""} : (tensor, tensor) -> tensor - %262 = "tf.Select"(%203, %257, %15) {device = ""} : (tensor, tensor, tensor) -> tensor - %263 = "tf.Pack"(%262, %15) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> - %264 = "tf.StridedSlice"(%263, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %265 = "tf.Cast"(%264) {Truncate = false, device = ""} : (tensor) -> tensor - %266 = "tf.Reshape"(%265, %11) {device = ""} : (tensor, tensor<0xi32>) -> tensor - %267 = "tf.Pack"(%9, %266) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %268 = "tf.Tile"(%204, %267) {device = ""} : (tensor, tensor<2xi32>) -> tensor - %269 = "tf.Mul"(%266, %207) {device = ""} : (tensor, tensor) -> tensor - %270 = "tf.Pack"(%269) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %271 = "tf.ConcatV2"(%206, %270, %208, %16) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %272 = "tf.Reshape"(%268, %271) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %273 = "tf.Shape"(%272) {device = ""} : (tensor) -> tensor<1xi64> - %274 = "tf.StridedSlice"(%273, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %275 = "tf.Pack"(%264) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %276 = "tf.StridedSlice"(%272, %275, %13, %14) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %277 = "tf.Sub"(%274, %264) {device = ""} : (tensor, tensor) -> tensor - %278 = "tf.Pack"(%277) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %279 = "tf.StridedSlice"(%272, %13, %278, %14) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %280:2 = "tf.RaggedRange"(%279, %276, %15) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) - %281 = "tf.Select"(%71, %257, %15) {device = ""} : (tensor, tensor, tensor) -> tensor - %282 = "tf.Pack"(%281, %15) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> - %283 = "tf.StridedSlice"(%282, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %284 = "tf.Cast"(%283) {Truncate = false, device = ""} : (tensor) -> tensor - %285 = "tf.Reshape"(%284, %11) {device = ""} : (tensor, tensor<0xi32>) -> tensor - %286 = "tf.Pack"(%9, %285) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %287 = "tf.Tile"(%53, %286) {device = ""} : (tensor, tensor<2xi32>) -> tensor - %288 = "tf.Mul"(%285, %56) {device = ""} : (tensor, tensor) -> tensor - %289 = "tf.Pack"(%288) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %290 = "tf.ConcatV2"(%55, %289, %57, %16) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %291 = "tf.Reshape"(%287, %290) {device = ""} : (tensor, 
tensor<1xi32>) -> tensor - %292 = "tf.Shape"(%291) {device = ""} : (tensor) -> tensor<1xi64> - %293 = "tf.StridedSlice"(%292, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %294 = "tf.Pack"(%283) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %295 = "tf.StridedSlice"(%291, %294, %13, %14) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %296 = "tf.Sub"(%293, %283) {device = ""} : (tensor, tensor) -> tensor - %297 = "tf.Pack"(%296) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %298 = "tf.StridedSlice"(%291, %13, %297, %14) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %299:2 = "tf.RaggedRange"(%298, %295, %15) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) - %300 = "tf.StridedSlice"(%282, %17, %18, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> - %301 = "tf.StridedSlice"(%282, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %302 = "tf.Mul"(%60, %301) {device = ""} : (tensor, tensor) -> tensor - %303 = "tf.Tile"(%302, %300) {device = ""} : (tensor, tensor<1xi64>) -> tensor - %304 = "tf.Cumsum"(%303, %16) {device = "", exclusive = false, reverse = false} : (tensor, tensor) -> tensor - %305 = "tf.ConcatV2"(%13, %304, %2) {device = ""} : (tensor<1xi64>, tensor, tensor) -> tensor - %306 = "tf.StridedSlice"(%305, %17, %6, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %307 = "tf.ExpandDims"(%306, %9) {device = ""} : (tensor, tensor) -> tensor - %308 = "tf.Shape"(%306) {device = ""} : (tensor) -> tensor<1xi32> - %309 = "tf.StridedSlice"(%308, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %310 = "tf.Pack"(%309) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %311 = "tf.StridedSlice"(%305, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %312 = "tf.ExpandDims"(%311, %9) {device = ""} : (tensor, tensor) -> tensor - %313 = "tf.Shape"(%311) {device = ""} : (tensor) -> tensor<1xi32> - %314 = "tf.StridedSlice"(%313, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %315 = "tf.Pack"(%314) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> 
- %316 = "tf.Equal"(%192, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %317 = "tf.Select"(%316, %257, %15) {device = ""} : (tensor, tensor, tensor) -> tensor - %318 = "tf.Cast"(%317) {Truncate = false, device = ""} : (tensor) -> tensor - %319 = "tf.Reshape"(%318, %11) {device = ""} : (tensor, tensor<0xi32>) -> tensor - %320 = "tf.Pack"(%9, %319) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %321 = "tf.Mul"(%319, %10) {device = ""} : (tensor, tensor) -> tensor - %322 = "tf.Pack"(%321) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %323 = "tf.ConcatV2"(%11, %322, %11, %16) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %324 = "tf.Pack"(%317) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %325 = "tf.Pack"(%12, %192) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> - %326 = "tf.ExpandDims"(%325, %9) {device = ""} : (tensor<2xi64>, tensor) -> tensor<2x1xi64> - %327 = "tf.Tile"(%326, %320) {device = ""} : (tensor<2x1xi64>, tensor<2xi32>) -> tensor<2x?xi64> - %328 = "tf.Reshape"(%327, %323) {device = ""} : (tensor<2x?xi64>, tensor<1xi32>) -> tensor - %329 = "tf.Shape"(%328) {device = ""} : (tensor) -> tensor<1xi64> - %330 = "tf.StridedSlice"(%329, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %331 = "tf.Sub"(%330, %317) {device = ""} : (tensor, tensor) -> tensor - %332 = "tf.Pack"(%331) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %333 = "tf.StridedSlice"(%328, %13, %332, %14) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %334 = "tf.StridedSlice"(%328, %324, %13, %14) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %335:2 = "tf.RaggedRange"(%333, %334, %15) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) - %336 = "tf.GatherV2"(%199, %335#1, %16) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %337 = "tf.Cast"(%336) {Truncate = false, device = ""} : (tensor) -> tensor - %338 = "tf.BroadcastTo"(%337, %310) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %339 = "tf.Max"(%338, %17) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor - %340 = "tf.Maximum"(%16, %339) {device = ""} : (tensor, tensor) -> tensor - %341 = "tf.Range"(%16, %340, %9) {device = ""} : (tensor, tensor, tensor) -> tensor - %342 = "tf.Pack"(%9, %340) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %343 = "tf.Tile"(%307, %342) {device = ""} : (tensor, tensor<2xi32>) -> tensor - %344 = "tf.Shape"(%343) {device = ""} : (tensor) -> tensor<2xi32> - %345 = "tf.StridedSlice"(%344, %17, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> - %346 = "tf.Prod"(%345, %17) {device = "", keep_dims = false} : (tensor<2xi32>, tensor<1xi32>) -> tensor - %347 = "tf.Pack"(%346) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %348 = 
"tf.Shape"(%343) {device = ""} : (tensor) -> tensor<2xi32> - %349 = "tf.StridedSlice"(%348, %17, %17, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %350 = "tf.Shape"(%343) {device = ""} : (tensor) -> tensor<2xi32> - %351 = "tf.StridedSlice"(%350, %7, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %352 = "tf.ConcatV2"(%349, %347, %351, %16) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %353 = "tf.Reshape"(%343, %352) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %354 = "tf.ExpandDims"(%338, %2) {device = ""} : (tensor, tensor) -> tensor - %355 = "tf.Less"(%341, %354) {device = ""} : (tensor, tensor) -> tensor - %356 = "tf.Reshape"(%355, %6) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %357 = "tf.Where"(%356) {device = ""} : (tensor) -> tensor - %358 = "tf.Squeeze"(%357) {device = "", squeeze_dims = [1]} : (tensor) -> tensor - %359 = "tf.GatherV2"(%353, %358, %16) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %360 = "tf.Cast"(%336) {Truncate = false, device = ""} : (tensor) -> tensor - %361 = "tf.BroadcastTo"(%360, %315) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %362 = "tf.Max"(%361, %17) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor - %363 = "tf.Maximum"(%16, %362) {device = ""} : (tensor, tensor) -> tensor - %364 = "tf.Range"(%16, %363, %9) {device = ""} : (tensor, tensor, tensor) -> tensor - %365 = "tf.Pack"(%9, %363) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %366 = "tf.Tile"(%312, %365) {device = ""} : (tensor, tensor<2xi32>) -> tensor - %367 = "tf.Shape"(%366) {device = ""} : (tensor) -> tensor<2xi32> - %368 = "tf.StridedSlice"(%367, %17, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> - %369 = "tf.Prod"(%368, %17) {device = "", keep_dims = false} : (tensor<2xi32>, tensor<1xi32>) -> tensor - %370 = "tf.Pack"(%369) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %371 = "tf.Shape"(%366) {device = ""} : (tensor) -> tensor<2xi32> - %372 = "tf.StridedSlice"(%371, %17, %17, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %373 = "tf.Shape"(%366) {device = ""} : (tensor) -> tensor<2xi32> - %374 = "tf.StridedSlice"(%373, %7, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %375 = "tf.ConcatV2"(%372, %370, %374, %16) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %376 = "tf.Reshape"(%366, %375) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %377 = "tf.ExpandDims"(%361, %2) {device = ""} : (tensor, tensor) -> tensor - %378 = "tf.Less"(%364, %377) {device = ""} : (tensor, tensor) -> tensor - %379 = "tf.Reshape"(%378, %6) {device = ""} : 
(tensor, tensor<1xi32>) -> tensor - %380 = "tf.Where"(%379) {device = ""} : (tensor) -> tensor - %381 = "tf.Squeeze"(%380) {device = "", squeeze_dims = [1]} : (tensor) -> tensor - %382 = "tf.GatherV2"(%376, %381, %16) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %383:2 = "tf.RaggedRange"(%359, %382, %15) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) - %384 = "tf.If"(%261, %261, %257, %67) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_1_AssertGuard_false_106180, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_1_AssertGuard_true_106170} : (tensor, tensor, tensor, tensor) -> tensor - %385 = "tf.Identity"(%384) {device = ""} : (tensor) -> tensor - %386 = "tf.StridedSlice"(%62, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %387 = "tf.Equal"(%386, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %388 = "tf.Select"(%387, %257, %386) {device = ""} : (tensor, tensor, tensor) -> tensor - %389 = "tf.Pack"(%388) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %390 = "tf.StridedSlice"(%62, %17, %17, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi64> - %391 = "tf.StridedSlice"(%62, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> - %392 = "tf.ConcatV2"(%390, %389, %391, %16) {device = ""} : (tensor<0xi64>, tensor<1xi64>, tensor<1xi64>, tensor) -> tensor<2xi64> - %393 = "tf.StridedSlice"(%392, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %394 = "tf.Equal"(%393, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %395 = "tf.StridedSlice"(%392, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %396 = "tf.StridedSlice"(%392, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %397 = "tf.Equal"(%396, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %398 = "tf.If"(%397, %397, %396, %336) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_2_AssertGuard_false_106670, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_2_AssertGuard_true_106660} : (tensor, tensor, tensor, 
tensor) -> tensor - %399 = "tf.Identity"(%398) {device = ""} : (tensor) -> tensor - %400 = "tf.If"(%394, %394, %336, %395) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_3_AssertGuard_false_107030, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_3_AssertGuard_true_107020} : (tensor, tensor, tensor, tensor) -> tensor - %401 = "tf.If"(%236, %236, %15, %232) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_Assert_AssertGuard_false_111870, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_Assert_AssertGuard_true_111860} : (tensor, tensor, tensor, tensor) -> tensor - %402 = "tf.Identity"(%401) {device = ""} : (tensor) -> tensor - %403 = "tf.Equal"(%232, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %404 = "tf.Select"(%403, %15, %232) {device = ""} : (tensor, tensor, tensor) -> tensor - %405 = "tf.Equal"(%404, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %406 = "tf.LogicalOr"(%405, %1) {device = ""} : (tensor, tensor) -> tensor - %407 = "tf.Equal"(%404, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %408 = "tf.LogicalOr"(%406, %407) {device = ""} : (tensor, tensor) -> tensor - %409 = "tf.Select"(%243, %404, %15) {device = ""} : (tensor, tensor, tensor) -> tensor - %410 = "tf.Pack"(%409, %15) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> - %411 = "tf.StridedSlice"(%410, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %412 = "tf.Cast"(%411) {Truncate = false, device = ""} : (tensor) -> tensor - %413 = "tf.Reshape"(%412, %11) {device = ""} : (tensor, tensor<0xi32>) -> tensor - %414 = "tf.Pack"(%9, %413) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %415 = "tf.Tile"(%244, %414) {device = ""} : (tensor, tensor<2xi32>) -> tensor - %416 = "tf.Mul"(%413, %247) {device = ""} : (tensor, tensor) -> tensor - %417 = "tf.Pack"(%416) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %418 = "tf.ConcatV2"(%246, %417, %248, %16) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %419 = "tf.Reshape"(%415, %418) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %420 = "tf.Shape"(%419) {device = ""} : (tensor) -> tensor<1xi64> - %421 = "tf.StridedSlice"(%420, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %422 = "tf.Pack"(%411) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %423 = "tf.StridedSlice"(%419, %422, %13, %14) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %424 = "tf.Sub"(%421, %411) {device = ""} : (tensor, tensor) -> tensor - %425 = "tf.Pack"(%424) {axis = 0 : i64, device = ""} : (tensor) -> 
tensor<1xi64> - %426 = "tf.StridedSlice"(%419, %13, %425, %14) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %427:2 = "tf.RaggedRange"(%426, %423, %15) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) - %428 = "tf.GatherV2"(%250, %427#1, %16) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %429 = "tf.StridedSlice"(%410, %17, %18, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> - %430 = "tf.StridedSlice"(%410, %17, %18, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> - %431 = "tf.StridedSlice"(%410, %7, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi64> - %432 = "tf.ConcatV2"(%430, %431, %16) {device = ""} : (tensor<1xi64>, tensor<0xi64>, tensor) -> tensor<1xi64> - %433 = "tf.StridedSlice"(%410, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %434 = "tf.Mul"(%253, %433) {device = ""} : (tensor, tensor) -> tensor - %435 = "tf.Tile"(%434, %429) {device = ""} : (tensor, tensor<1xi64>) -> tensor - %436 = "tf.Cumsum"(%435, %16) {device = "", exclusive = false, reverse = false} : (tensor, tensor) -> tensor - %437 = "tf.ConcatV2"(%13, %436, %2) {device = ""} : (tensor<1xi64>, tensor, tensor) -> tensor - %438 = "tf.Shape"(%437) {device = ""} : (tensor) -> tensor<1xi64> - %439 = "tf.StridedSlice"(%438, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %440 = "tf.Sub"(%439, %15) {device = ""} : (tensor, tensor) -> tensor - %441 = "tf.Equal"(%440, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %442 = "tf.LogicalOr"(%117, %441) {device = ""} : (tensor, tensor) -> tensor - %443 = "tf.Equal"(%440, %116) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %444 = "tf.LogicalOr"(%442, %443) {device = ""} : (tensor, tensor) -> tensor - %445 = "tf.StridedSlice"(%437, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %446 = "tf.StridedSlice"(%437, %17, %6, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %447 = "tf.Sub"(%445, %446) {device = ""} : (tensor, tensor) -> tensor - %448 = "tf.Shape"(%437) {device = ""} : (tensor) -> tensor<1xi64> - %449 = "tf.StridedSlice"(%448, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, 
new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %450 = "tf.Sub"(%449, %15) {device = ""} : (tensor, tensor) -> tensor - %451 = "tf.Equal"(%450, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %452 = "tf.ExpandDims"(%437, %9) {device = ""} : (tensor, tensor) -> tensor - %453 = "tf.Shape"(%437) {device = ""} : (tensor) -> tensor<1xi32> - %454 = "tf.StridedSlice"(%453, %17, %17, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %455 = "tf.StridedSlice"(%453, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %456 = "tf.StridedSlice"(%453, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %457 = "tf.Select"(%1, %404, %15) {device = ""} : (tensor, tensor, tensor) -> tensor - %458 = "tf.Pack"(%457, %15) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> - %459 = "tf.StridedSlice"(%458, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %460 = "tf.Cast"(%459) {Truncate = false, device = ""} : (tensor) -> tensor - %461 = "tf.Reshape"(%460, %11) {device = ""} : (tensor, tensor<0xi32>) -> tensor - %462 = "tf.Pack"(%9, %461) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %463 = "tf.Tile"(%3, %462) {device = ""} : (tensor<2x1xi64>, tensor<2xi32>) -> tensor<2x?xi64> - %464 = "tf.Mul"(%461, %10) {device = ""} : (tensor, tensor) -> tensor - %465 = "tf.Pack"(%464) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %466 = "tf.ConcatV2"(%11, %465, %11, %16) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %467 = "tf.Reshape"(%463, %466) {device = ""} : (tensor<2x?xi64>, tensor<1xi32>) -> tensor - %468 = "tf.Shape"(%467) {device = ""} : (tensor) -> tensor<1xi64> - %469 = "tf.StridedSlice"(%468, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %470 = "tf.Pack"(%459) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %471 = "tf.StridedSlice"(%467, %470, %13, %14) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %472 = "tf.Sub"(%469, %459) {device = ""} : (tensor, tensor) -> tensor - %473 = "tf.Pack"(%472) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %474 = "tf.StridedSlice"(%467, %13, %473, %14) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %475:2 = "tf.RaggedRange"(%474, %471, %15) {T = i64, Tsplits = i64, device = ""} : (tensor, 
tensor, tensor) -> (tensor, tensor) - %476 = "tf.GatherV2"(%13, %475#1, %16) {batch_dims = 0 : i64, device = ""} : (tensor<1xi64>, tensor, tensor) -> tensor - %477 = "tf.GatherV2"(%14, %476, %16) {batch_dims = 0 : i64, device = ""} : (tensor<1xi64>, tensor, tensor) -> tensor - %478 = "tf.StridedSlice"(%458, %17, %18, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> - %479 = "tf.StridedSlice"(%458, %17, %18, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> - %480 = "tf.StridedSlice"(%458, %7, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi64> - %481 = "tf.ConcatV2"(%479, %480, %16) {device = ""} : (tensor<1xi64>, tensor<0xi64>, tensor) -> tensor<1xi64> - %482 = "tf.Tile"(%477, %481) {device = ""} : (tensor, tensor<1xi64>) -> tensor - %483 = "tf.StridedSlice"(%458, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %484 = "tf.Mul"(%483, %14) {device = ""} : (tensor, tensor<1xi64>) -> tensor<1xi64> - %485 = "tf.Tile"(%484, %478) {device = ""} : (tensor<1xi64>, tensor<1xi64>) -> tensor - %486 = "tf.Cumsum"(%485, %16) {device = "", exclusive = false, reverse = false} : (tensor, tensor) -> tensor - %487 = "tf.ConcatV2"(%13, %486, %2) {device = ""} : (tensor<1xi64>, tensor, tensor) -> tensor - %488 = "tf.StridedSlice"(%487, %17, %6, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %489 = "tf.ExpandDims"(%488, %9) {device = ""} : (tensor, tensor) -> tensor - %490 = "tf.Shape"(%488) {device = ""} : (tensor) -> tensor<1xi32> - %491 = "tf.StridedSlice"(%490, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %492 = "tf.Pack"(%491) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %493 = "tf.StridedSlice"(%487, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %494 = "tf.ExpandDims"(%493, %9) {device = ""} : (tensor, tensor) -> tensor - %495 = "tf.Shape"(%493) {device = ""} : (tensor) -> tensor<1xi32> - %496 = "tf.StridedSlice"(%495, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %497 = "tf.Pack"(%496) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %498 = "tf.Equal"(%232, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %499 = "tf.Select"(%498, %404, %15) {device = ""} : (tensor, tensor, tensor) -> tensor - %500 
= "tf.Cast"(%499) {Truncate = false, device = ""} : (tensor) -> tensor - %501 = "tf.Reshape"(%500, %11) {device = ""} : (tensor, tensor<0xi32>) -> tensor - %502 = "tf.Pack"(%9, %501) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %503 = "tf.Mul"(%501, %10) {device = ""} : (tensor, tensor) -> tensor - %504 = "tf.Pack"(%503) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %505 = "tf.ConcatV2"(%11, %504, %11, %16) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %506 = "tf.Pack"(%499) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %507 = "tf.Pack"(%12, %232) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> - %508 = "tf.ExpandDims"(%507, %9) {device = ""} : (tensor<2xi64>, tensor) -> tensor<2x1xi64> - %509 = "tf.Tile"(%508, %502) {device = ""} : (tensor<2x1xi64>, tensor<2xi32>) -> tensor<2x?xi64> - %510 = "tf.Reshape"(%509, %505) {device = ""} : (tensor<2x?xi64>, tensor<1xi32>) -> tensor - %511 = "tf.Shape"(%510) {device = ""} : (tensor) -> tensor<1xi64> - %512 = "tf.StridedSlice"(%511, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %513 = "tf.Sub"(%512, %499) {device = ""} : (tensor, tensor) -> tensor - %514 = "tf.Pack"(%513) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %515 = "tf.StridedSlice"(%510, %13, %514, %14) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %516 = "tf.StridedSlice"(%510, %506, %13, %14) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %517:2 = "tf.RaggedRange"(%515, %516, %15) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) - %518 = "tf.GatherV2"(%239, %517#1, %16) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %519 = "tf.Cast"(%518) {Truncate = false, device = ""} : (tensor) -> tensor - %520 = "tf.BroadcastTo"(%519, %492) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %521 = "tf.Max"(%520, %17) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor - %522 = "tf.Maximum"(%16, %521) {device = ""} : (tensor, tensor) -> tensor - %523 = "tf.Range"(%16, %522, %9) {device = ""} : (tensor, tensor, tensor) -> tensor - %524 = "tf.Pack"(%9, %522) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %525 = "tf.Tile"(%489, %524) {device = ""} : (tensor, tensor<2xi32>) -> tensor - %526 = "tf.Shape"(%525) {device = ""} : (tensor) -> tensor<2xi32> - %527 = "tf.StridedSlice"(%526, %17, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> - %528 = "tf.Prod"(%527, %17) {device = "", keep_dims = false} : (tensor<2xi32>, tensor<1xi32>) -> tensor - %529 = "tf.Pack"(%528) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %530 = "tf.Shape"(%525) {device = ""} : (tensor) -> tensor<2xi32> - %531 = "tf.StridedSlice"(%530, %17, %17, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 
0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %532 = "tf.Shape"(%525) {device = ""} : (tensor) -> tensor<2xi32> - %533 = "tf.StridedSlice"(%532, %7, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %534 = "tf.ConcatV2"(%531, %529, %533, %16) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %535 = "tf.Reshape"(%525, %534) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %536 = "tf.ExpandDims"(%520, %2) {device = ""} : (tensor, tensor) -> tensor - %537 = "tf.Less"(%523, %536) {device = ""} : (tensor, tensor) -> tensor - %538 = "tf.Reshape"(%537, %6) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %539 = "tf.Where"(%538) {device = ""} : (tensor) -> tensor - %540 = "tf.Squeeze"(%539) {device = "", squeeze_dims = [1]} : (tensor) -> tensor - %541 = "tf.GatherV2"(%535, %540, %16) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %542 = "tf.Cast"(%518) {Truncate = false, device = ""} : (tensor) -> tensor - %543 = "tf.BroadcastTo"(%542, %497) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %544 = "tf.Max"(%543, %17) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor - %545 = "tf.Maximum"(%16, %544) {device = ""} : (tensor, tensor) -> tensor - %546 = "tf.Range"(%16, %545, %9) {device = ""} : (tensor, tensor, tensor) -> tensor - %547 = "tf.Pack"(%9, %545) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %548 = "tf.Tile"(%494, %547) {device = ""} : (tensor, tensor<2xi32>) -> tensor - %549 = "tf.Shape"(%548) {device = ""} : (tensor) -> tensor<2xi32> - %550 = "tf.StridedSlice"(%549, %17, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> - %551 = "tf.Prod"(%550, %17) {device = "", keep_dims = false} : (tensor<2xi32>, tensor<1xi32>) -> tensor - %552 = "tf.Pack"(%551) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %553 = "tf.Shape"(%548) {device = ""} : (tensor) -> tensor<2xi32> - %554 = "tf.StridedSlice"(%553, %17, %17, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %555 = "tf.Shape"(%548) {device = ""} : (tensor) -> tensor<2xi32> - %556 = "tf.StridedSlice"(%555, %7, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %557 = "tf.ConcatV2"(%554, %552, %556, %16) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %558 = "tf.Reshape"(%548, %557) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %559 = "tf.ExpandDims"(%543, %2) {device = ""} : (tensor, tensor) -> tensor - %560 = "tf.Less"(%546, %559) {device = ""} : (tensor, tensor) -> tensor - %561 = "tf.Reshape"(%560, %6) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %562 = "tf.Where"(%561) {device = ""} : (tensor) -> tensor - %563 = "tf.Squeeze"(%562) {device = "", squeeze_dims = [1]} : (tensor) -> tensor - %564 = 
"tf.GatherV2"(%558, %563, %16) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %565:2 = "tf.RaggedRange"(%541, %564, %15) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) - %566 = "tf.GatherV2"(%482, %565#1, %16) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %567 = "tf.If"(%408, %408, %404, %15) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_Assert_1_AssertGuard_false_112940, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_Assert_1_AssertGuard_true_112930} : (tensor, tensor, tensor, tensor) -> tensor - %568 = "tf.Identity"(%567) {device = ""} : (tensor) -> tensor - %569 = "tf.Select"(%1, %404, %15) {device = ""} : (tensor, tensor, tensor) -> tensor - %570 = "tf.Pack"(%569) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %571 = "tf.ConcatV2"(%0, %570, %14, %16) {device = ""} : (tensor<0xi64>, tensor<1xi64>, tensor<1xi64>, tensor) -> tensor<2xi64> - %572 = "tf.StridedSlice"(%571, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %573 = "tf.Equal"(%572, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %574 = "tf.StridedSlice"(%571, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %575 = "tf.StridedSlice"(%571, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %576 = "tf.Equal"(%575, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %577 = "tf.If"(%576, %576, %575, %518) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_Assert_2_AssertGuard_false_113430, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_Assert_2_AssertGuard_true_113420} : (tensor, tensor, tensor, tensor) -> tensor - %578 = "tf.Identity"(%577) {device = ""} : (tensor) -> tensor - %579 = "tf.If"(%573, %573, %518, %574) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_Assert_3_AssertGuard_false_113790, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_Assert_3_AssertGuard_true_113780} : (tensor, tensor, tensor, tensor) -> tensor - %580 = "tf.Identity"(%579) {device = ""} : (tensor) -> tensor - %581 = "tf.If"(%444, %444, %116, %440) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_AssertGuard_false_118470, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_AssertGuard_true_118460} : (tensor, tensor, 
tensor, tensor) -> tensor - %582 = "tf.Identity"(%581) {device = ""} : (tensor) -> tensor - %583 = "tf.Equal"(%440, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %584 = "tf.Select"(%583, %116, %440) {device = ""} : (tensor, tensor, tensor) -> tensor - %585 = "tf.Equal"(%584, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %586 = "tf.LogicalOr"(%585, %119) {device = ""} : (tensor, tensor) -> tensor - %587 = "tf.Equal"(%118, %584) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %588 = "tf.LogicalOr"(%586, %587) {device = ""} : (tensor, tensor) -> tensor - %589 = "tf.Select"(%451, %584, %15) {device = ""} : (tensor, tensor, tensor) -> tensor - %590 = "tf.Pack"(%589, %15) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> - %591 = "tf.StridedSlice"(%590, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %592 = "tf.Cast"(%591) {Truncate = false, device = ""} : (tensor) -> tensor - %593 = "tf.Reshape"(%592, %11) {device = ""} : (tensor, tensor<0xi32>) -> tensor - %594 = "tf.Pack"(%9, %593) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %595 = "tf.Tile"(%452, %594) {device = ""} : (tensor, tensor<2xi32>) -> tensor - %596 = "tf.Mul"(%593, %455) {device = ""} : (tensor, tensor) -> tensor - %597 = "tf.Pack"(%596) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %598 = "tf.ConcatV2"(%454, %597, %456, %16) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %599 = "tf.Reshape"(%595, %598) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %600 = "tf.Shape"(%599) {device = ""} : (tensor) -> tensor<1xi64> - %601 = "tf.StridedSlice"(%600, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %602 = "tf.Pack"(%591) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %603 = "tf.StridedSlice"(%599, %602, %13, %14) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %604 = "tf.Sub"(%601, %591) {device = ""} : (tensor, tensor) -> tensor - %605 = "tf.Pack"(%604) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %606 = "tf.StridedSlice"(%599, %13, %605, %14) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %607:2 = "tf.RaggedRange"(%606, %603, %15) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) - %608 = "tf.Select"(%124, %584, %15) {device = ""} : (tensor, tensor, tensor) -> tensor - %609 = "tf.Pack"(%608, %15) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> - %610 = "tf.StridedSlice"(%609, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %611 = "tf.Cast"(%610) {Truncate = false, device = ""} : (tensor) -> tensor - %612 = "tf.Reshape"(%611, %11) 
{device = ""} : (tensor, tensor<0xi32>) -> tensor - %613 = "tf.Pack"(%9, %612) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %614 = "tf.Tile"(%106, %613) {device = ""} : (tensor, tensor<2xi32>) -> tensor - %615 = "tf.Mul"(%612, %109) {device = ""} : (tensor, tensor) -> tensor - %616 = "tf.Pack"(%615) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %617 = "tf.ConcatV2"(%108, %616, %110, %16) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %618 = "tf.Reshape"(%614, %617) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %619 = "tf.Shape"(%618) {device = ""} : (tensor) -> tensor<1xi64> - %620 = "tf.StridedSlice"(%619, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %621 = "tf.Pack"(%610) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %622 = "tf.StridedSlice"(%618, %621, %13, %14) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %623 = "tf.Sub"(%620, %610) {device = ""} : (tensor, tensor) -> tensor - %624 = "tf.Pack"(%623) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %625 = "tf.StridedSlice"(%618, %13, %624, %14) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %626:2 = "tf.RaggedRange"(%625, %622, %15) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) - %627 = "tf.StridedSlice"(%609, %17, %18, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> - %628 = "tf.StridedSlice"(%609, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %629 = "tf.Mul"(%113, %628) {device = ""} : (tensor, tensor) -> tensor - %630 = "tf.Tile"(%629, %627) {device = ""} : (tensor, tensor<1xi64>) -> tensor - %631 = "tf.Cumsum"(%630, %16) {device = "", exclusive = false, reverse = false} : (tensor, tensor) -> tensor - %632 = "tf.ConcatV2"(%13, %631, %2) {device = ""} : (tensor<1xi64>, tensor, tensor) -> tensor - %633 = "tf.StridedSlice"(%632, %17, %6, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %634 = "tf.ExpandDims"(%633, %9) {device = ""} : (tensor, tensor) -> tensor - %635 = "tf.Shape"(%633) {device = ""} : (tensor) -> tensor<1xi32> - %636 = "tf.StridedSlice"(%635, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %637 = "tf.Pack"(%636) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %638 = "tf.StridedSlice"(%632, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, 
shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %639 = "tf.ExpandDims"(%638, %9) {device = ""} : (tensor, tensor) -> tensor - %640 = "tf.Shape"(%638) {device = ""} : (tensor) -> tensor<1xi32> - %641 = "tf.StridedSlice"(%640, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %642 = "tf.Pack"(%641) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %643 = "tf.Equal"(%440, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %644 = "tf.Select"(%643, %584, %15) {device = ""} : (tensor, tensor, tensor) -> tensor - %645 = "tf.Cast"(%644) {Truncate = false, device = ""} : (tensor) -> tensor - %646 = "tf.Reshape"(%645, %11) {device = ""} : (tensor, tensor<0xi32>) -> tensor - %647 = "tf.Pack"(%9, %646) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %648 = "tf.Mul"(%646, %10) {device = ""} : (tensor, tensor) -> tensor - %649 = "tf.Pack"(%648) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %650 = "tf.ConcatV2"(%11, %649, %11, %16) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %651 = "tf.Pack"(%644) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %652 = "tf.Pack"(%12, %440) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> - %653 = "tf.ExpandDims"(%652, %9) {device = ""} : (tensor<2xi64>, tensor) -> tensor<2x1xi64> - %654 = "tf.Tile"(%653, %647) {device = ""} : (tensor<2x1xi64>, tensor<2xi32>) -> tensor<2x?xi64> - %655 = "tf.Reshape"(%654, %650) {device = ""} : (tensor<2x?xi64>, tensor<1xi32>) -> tensor - %656 = "tf.Shape"(%655) {device = ""} : (tensor) -> tensor<1xi64> - %657 = "tf.StridedSlice"(%656, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %658 = "tf.Sub"(%657, %644) {device = ""} : (tensor, tensor) -> tensor - %659 = "tf.Pack"(%658) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %660 = "tf.StridedSlice"(%655, %13, %659, %14) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %661 = "tf.StridedSlice"(%655, %651, %13, %14) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %662:2 = "tf.RaggedRange"(%660, %661, %15) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) - %663 = "tf.GatherV2"(%447, %662#1, %16) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %664 = "tf.Cast"(%663) {Truncate = false, device = ""} : (tensor) -> tensor - %665 = "tf.BroadcastTo"(%664, %637) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %666 = "tf.Max"(%665, %17) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor - %667 = "tf.Maximum"(%16, %666) {device = ""} : (tensor, tensor) -> tensor - %668 = "tf.Range"(%16, %667, %9) {device = ""} : (tensor, tensor, tensor) -> tensor - %669 = "tf.Pack"(%9, %667) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %670 = "tf.Tile"(%634, %669) 
{device = ""} : (tensor, tensor<2xi32>) -> tensor - %671 = "tf.Shape"(%670) {device = ""} : (tensor) -> tensor<2xi32> - %672 = "tf.StridedSlice"(%671, %17, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> - %673 = "tf.Prod"(%672, %17) {device = "", keep_dims = false} : (tensor<2xi32>, tensor<1xi32>) -> tensor - %674 = "tf.Pack"(%673) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %675 = "tf.Shape"(%670) {device = ""} : (tensor) -> tensor<2xi32> - %676 = "tf.StridedSlice"(%675, %17, %17, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %677 = "tf.Shape"(%670) {device = ""} : (tensor) -> tensor<2xi32> - %678 = "tf.StridedSlice"(%677, %7, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %679 = "tf.ConcatV2"(%676, %674, %678, %16) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %680 = "tf.Reshape"(%670, %679) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %681 = "tf.ExpandDims"(%665, %2) {device = ""} : (tensor, tensor) -> tensor - %682 = "tf.Less"(%668, %681) {device = ""} : (tensor, tensor) -> tensor - %683 = "tf.Reshape"(%682, %6) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %684 = "tf.Where"(%683) {device = ""} : (tensor) -> tensor - %685 = "tf.Squeeze"(%684) {device = "", squeeze_dims = [1]} : (tensor) -> tensor - %686 = "tf.GatherV2"(%680, %685, %16) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %687 = "tf.Cast"(%663) {Truncate = false, device = ""} : (tensor) -> tensor - %688 = "tf.BroadcastTo"(%687, %642) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %689 = "tf.Max"(%688, %17) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor - %690 = "tf.Maximum"(%16, %689) {device = ""} : (tensor, tensor) -> tensor - %691 = "tf.Range"(%16, %690, %9) {device = ""} : (tensor, tensor, tensor) -> tensor - %692 = "tf.Pack"(%9, %690) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %693 = "tf.Tile"(%639, %692) {device = ""} : (tensor, tensor<2xi32>) -> tensor - %694 = "tf.Shape"(%693) {device = ""} : (tensor) -> tensor<2xi32> - %695 = "tf.StridedSlice"(%694, %17, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> - %696 = "tf.Prod"(%695, %17) {device = "", keep_dims = false} : (tensor<2xi32>, tensor<1xi32>) -> tensor - %697 = "tf.Pack"(%696) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %698 = "tf.Shape"(%693) {device = ""} : (tensor) -> tensor<2xi32> - %699 = "tf.StridedSlice"(%698, %17, %17, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %700 = "tf.Shape"(%693) {device = ""} : (tensor) -> tensor<2xi32> - %701 = "tf.StridedSlice"(%700, %7, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 
: i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %702 = "tf.ConcatV2"(%699, %697, %701, %16) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %703 = "tf.Reshape"(%693, %702) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %704 = "tf.ExpandDims"(%688, %2) {device = ""} : (tensor, tensor) -> tensor - %705 = "tf.Less"(%691, %704) {device = ""} : (tensor, tensor) -> tensor - %706 = "tf.Reshape"(%705, %6) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %707 = "tf.Where"(%706) {device = ""} : (tensor) -> tensor - %708 = "tf.Squeeze"(%707) {device = "", squeeze_dims = [1]} : (tensor) -> tensor - %709 = "tf.GatherV2"(%703, %708, %16) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %710:2 = "tf.RaggedRange"(%686, %709, %15) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) - %711 = "tf.If"(%588, %588, %584, %120) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_1_AssertGuard_false_119540, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_1_AssertGuard_true_119530} : (tensor, tensor, tensor, tensor) -> tensor - %712 = "tf.Identity"(%711) {device = ""} : (tensor) -> tensor - %713 = "tf.StridedSlice"(%115, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %714 = "tf.Equal"(%713, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %715 = "tf.Select"(%714, %584, %713) {device = ""} : (tensor, tensor, tensor) -> tensor - %716 = "tf.Pack"(%715) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %717 = "tf.StridedSlice"(%115, %17, %17, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi64> - %718 = "tf.StridedSlice"(%115, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> - %719 = "tf.ConcatV2"(%717, %716, %718, %16) {device = ""} : (tensor<0xi64>, tensor<1xi64>, tensor<1xi64>, tensor) -> tensor<2xi64> - %720 = "tf.StridedSlice"(%719, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %721 = "tf.Equal"(%720, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %722 = "tf.StridedSlice"(%719, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %723 = "tf.StridedSlice"(%719, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, 
tensor<1xi32>, tensor<1xi32>) -> tensor - %724 = "tf.Equal"(%723, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %725 = "tf.If"(%724, %724, %723, %663) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_2_AssertGuard_false_120030, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_2_AssertGuard_true_120020} : (tensor, tensor, tensor, tensor) -> tensor - %726 = "tf.Identity"(%725) {device = ""} : (tensor) -> tensor - %727 = "tf.If"(%721, %721, %663, %722) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_3_AssertGuard_false_120390, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_3_AssertGuard_true_120380} : (tensor, tensor, tensor, tensor) -> tensor - %728 = "tf.Identity"(%168) {device = ""} : (tensor) -> tensor - %729 = "tf.Identity"(%727) {device = ""} : (tensor) -> tensor - %730 = "tf.Identity"(%400) {device = ""} : (tensor) -> tensor - %731 = "tf.Shape"(%125#2) {device = ""} : (tensor) -> tensor<1xi32> - %732 = "tf.StridedSlice"(%731, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %733 = "tf.Cast"(%732) {Truncate = false, device = ""} : (tensor<0xi32>) -> tensor<0xi64> - %734 = "tf.Identity"(%733) {device = ""} : (tensor<0xi64>) -> tensor<0xi64> - %735 = "tf.Shape"(%125#3) {device = ""} : (tensor) -> tensor<1xi32> - %736 = "tf.StridedSlice"(%735, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %737 = "tf.Cast"(%736) {Truncate = false, device = ""} : (tensor<0xi32>) -> tensor<0xi64> - %738 = "tf.Identity"(%737) {device = ""} : (tensor<0xi64>) -> tensor<0xi64> - %739 = "tf.GatherV2"(%125#3, %428, %16) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %740 = "tf.Tile"(%739, %432) {device = ""} : (tensor, tensor<1xi64>) -> tensor - %741 = "tf.Sub"(%740, %566) {device = ""} : (tensor, tensor) -> tensor - %742 = "tf.Shape"(%741) {device = ""} : (tensor) -> tensor<1xi32> - %743 = "tf.StridedSlice"(%742, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %744 = "tf.Cast"(%743) {Truncate = false, device = ""} : (tensor<0xi32>) -> tensor<0xi64> - %745 = "tf.Identity"(%744) {device = ""} : (tensor<0xi64>) -> tensor<0xi64> - %746 = "tf.UnicodeEncode"(%125#0, %146) {Tsplits = i64, device = "", errors = "replace", output_encoding = "UTF-8", replacement_char = 65533 : i64} : (tensor, tensor) -> tensor - %747 = "tf.Identity"(%746) {device = ""} : (tensor) -> tensor - %748 = "tf.StridedSlice"(%19, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, 
tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %749 = "tf.AddV2"(%748, %15) {device = ""} : (tensor, tensor) -> tensor - %750 = "tf.Range"(%12, %749, %15) {device = ""} : (tensor, tensor, tensor) -> tensor - %751 = "tf.Mul"(%750, %15) {device = ""} : (tensor, tensor) -> tensor - %752 = "tf.Identity"(%751) {device = ""} : (tensor) -> tensor - return %747, %752, %728 : tensor, tensor, tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedConcat_assert_equal_1_Assert_AssertGuard_false_99640(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Input tensors have incompatible shapes."> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedConcat/RaggedFromTensor/strided_slice_4:0) = "> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedConcat/RaggedNRows/sub:0) = "> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedConcat_assert_equal_1_Assert_AssertGuard_true_99630(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_100400(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:zero"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits/RowPartitionFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits/RowPartitionFromRowSplits/Const:0) = "> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_100390(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = 
"tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_100760(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:monotonic"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<"Condition x >= 0 did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits/RowPartitionFromRowSplits/sub:0) = "> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor) -> () - %3 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %4 = "tf.Identity"(%3) {device = ""} : (tensor) -> tensor - return %4 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_100750(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_101100(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Arguments to _from_row_partition do not form a valid RaggedTensor"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits/strided_slice_1:0) = "> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_101090(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_101470(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor 
attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:zero"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits_1/RowPartitionFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits_1/RowPartitionFromRowSplits/Const:0) = "> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_101460(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_101830(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:monotonic"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<"Condition x >= 0 did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits_1/RowPartitionFromRowSplits/sub:0) = "> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor) -> () - %3 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %4 = "tf.Identity"(%3) {device = ""} : (tensor) -> tensor - return %4 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_101820(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_assert_equal_1_Assert_AssertGuard_false_102190(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Arguments to _from_row_partition do not form a valid RaggedTensor"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<"Condition x == y did not hold 
element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits_1/strided_slice:0) = "> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits_1/RaggedNRows/sub:0) = "> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_assert_equal_1_Assert_AssertGuard_true_102180(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_102540(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:zero"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_1/RaggedFromRowSplits/RowPartitionFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_1/RaggedFromRowSplits/RowPartitionFromRowSplits/Const:0) = "> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_102530(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_102900(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:monotonic"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<"Condition x >= 0 did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x 
(WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_1/RaggedFromRowSplits/RowPartitionFromRowSplits/sub:0) = "> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor) -> () - %3 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %4 = "tf.Identity"(%3) {device = ""} : (tensor) -> tensor - return %4 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_102890(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_103240(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Arguments to _from_row_partition do not form a valid RaggedTensor"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_1/RaggedFromRowSplits/strided_slice_1:0) = "> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_1/RaggedFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_103230(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_103610(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:zero"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_2/RaggedFromRowSplits/RowPartitionFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_2/RaggedFromRowSplits/RowPartitionFromRowSplits/Const:0) = "> : tensor} 
: () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_103600(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_103970(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:monotonic"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<"Condition x >= 0 did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_2/RaggedFromRowSplits/RowPartitionFromRowSplits/sub:0) = "> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor) -> () - %3 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %4 = "tf.Identity"(%3) {device = ""} : (tensor) -> tensor - return %4 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_103960(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_104310(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Arguments to _from_row_partition do not form a valid RaggedTensor"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_2/RaggedFromRowSplits/strided_slice_1:0) = "> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_2/RaggedFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func 
@WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_104300(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_AssertGuard_false_105110(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_AssertGuard_true_105100(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_1_AssertGuard_false_106180(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_1_AssertGuard_true_106170(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_2_AssertGuard_false_106670(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor - %3 = "tf.Const"() {value = 
dense<"dim_size="> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_2_AssertGuard_true_106660(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_3_AssertGuard_false_107030(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_3_AssertGuard_true_107020(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_Assert_AssertGuard_false_111870(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_Assert_AssertGuard_true_111860(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_Assert_1_AssertGuard_false_112940(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = 
dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_Assert_1_AssertGuard_true_112930(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_Assert_2_AssertGuard_false_113430(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_Assert_2_AssertGuard_true_113420(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_Assert_3_AssertGuard_false_113790(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_Assert_3_AssertGuard_true_113780(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func 
@WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_AssertGuard_false_118470(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_AssertGuard_true_118460(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_1_AssertGuard_false_119540(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_1_AssertGuard_true_119530(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_2_AssertGuard_false_120030(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func 
@WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_2_AssertGuard_true_120020(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_3_AssertGuard_false_120390(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_3_AssertGuard_true_120380(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - - - // CHECK: func @whitespace_tokenizer_rank2(%arg0: tensor {tf._user_specified_name = "input"}) -> (tensor, tensor, tensor) attributes {tf._input_shapes = [#tf.shape], tf.api_implements = "tftext:WhitespaceTokenizer", tf.signature.is_stateful} { - // CHECK: %0:3 = "tfl.custom"(%arg0) {custom_code = "tftext:WhitespaceTokenizer", custom_option = opaque<"tfl", "0x"> : tensor<0xi8>} : (tensor) -> (tensor, tensor, tensor) - // CHECK: return %0#0, %0#1, %0#2 : tensor, tensor, tensor - - func @whitespace_tokenizer_rank0(%arg0: tensor {tf._user_specified_name = "input"}) -> tensor attributes {tf._input_shapes = [#tf.shape<>], tf.api_implements = "tftext:WhitespaceTokenizer", tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<[0, 1]> : tensor<2xi64>} : () -> tensor<2xi64> - %1 = "tf.Const"() {value = dense<[]> : tensor<0xi64>} : () -> tensor<0xi64> - %2 = "tf.Const"() {value = dense : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor - %4 = "tf.Const"() {value = dense<[[0], [1]]> : tensor<2x1xi64>} : () -> tensor<2x1xi64> - %5 = "tf.Const"() {value = dense<-1> : tensor<1xi32>} : () -> tensor<1xi32> - %6 = "tf.Const"() {value = dense<2> : tensor<1xi32>} : () -> tensor<1xi32> - %7 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - %8 = "tf.Const"() {value = dense<2> : tensor} : () -> tensor - %9 = "tf.Const"() {value = dense<[]> : tensor<0xi32>} : () -> tensor<0xi32> - %10 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - %11 = "tf.Const"() {value = dense<0> : tensor<1xi64>} : () -> tensor<1xi64> - %12 = "tf.Const"() {value = dense<1> : tensor<1xi64>} : () -> tensor<1xi64> - %13 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - %14 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - %15 = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> - %16 = "tf.Const"() 
{value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> - %17 = "tf.If"(%2, %2, %13, %13) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedConcat_assert_equal_1_Assert_AssertGuard_false_3220, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedConcat_assert_equal_1_Assert_AssertGuard_true_3210} : (tensor, tensor, tensor, tensor) -> tensor - %18 = "tf.Identity"(%17) {device = ""} : (tensor) -> tensor - %19 = "tf.Pack"(%arg0) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1x!tf.string> - %20 = "tf.StringLength"(%19) {device = "", unit = "BYTE"} : (tensor<1x!tf.string>) -> tensor<1xi32> - %21 = "tf.ExpandDims"(%20, %7) {device = ""} : (tensor<1xi32>, tensor) -> tensor<1x1xi32> - %22 = "tf.Cast"(%21) {Truncate = false, device = ""} : (tensor<1x1xi32>) -> tensor<1x1xi64> - %23 = "tf.Reshape"(%22, %12) {device = ""} : (tensor<1x1xi64>, tensor<1xi64>) -> tensor<1xi64> - %24 = "tf.Reshape"(%19, %5) {device = ""} : (tensor<1x!tf.string>, tensor<1xi32>) -> tensor<1x!tf.string> - %25:3 = "tf.UnicodeDecodeWithOffsets"(%24) {Tsplits = i64, device = "", errors = "replace", input_encoding = "UTF-8", replace_control_characters = false, replacement_char = 65533 : i64} : (tensor<1x!tf.string>) -> (tensor<2xi64>, tensor, tensor) - %26 = "tf.StridedSlice"(%25#0, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> - %27 = "tf.AddV2"(%26, %13) {device = ""} : (tensor<1xi64>, tensor) -> tensor<1xi64> - %28 = "tf.StridedSlice"(%25#0, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> - %29 = "tf.Minimum"(%27, %28) {device = ""} : (tensor<1xi64>, tensor<1xi64>) -> tensor<1xi64> - %30:2 = "tf.RaggedRange"(%29, %28, %13) {T = i64, Tsplits = i64, device = ""} : (tensor<1xi64>, tensor<1xi64>, tensor) -> (tensor<2xi64>, tensor) - %31 = "tf.StridedSlice"(%30#0, %5, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %32 = "tf.AddV2"(%31, %12) {device = ""} : (tensor, tensor<1xi64>) -> tensor<1xi64> - %33 = "tf.ConcatV2"(%30#0, %32, %14) {device = ""} : (tensor<2xi64>, tensor<1xi64>, tensor) -> tensor<3xi64> - %34 = "tf.GatherV2"(%25#2, %30#1, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %35 = "tf.ConcatV2"(%34, %23, %14) {device = ""} : (tensor, tensor<1xi64>, tensor) -> tensor - %36:2 = "tf.RaggedGather"(%33, %35, %0) {OUTPUT_RAGGED_RANK = 1 : i64, PARAMS_RAGGED_RANK = 1 : i64, Tindices = i64, Tsplits = i64, Tvalues = i64, device = ""} : (tensor<3xi64>, tensor, tensor<2xi64>) -> (tensor, tensor) - %37:5 = "tf.WhitespaceTokenizeWithOffsets"(%25#1, %25#0) {Tsplits = i64, device = ""} : (tensor, tensor<2xi64>) -> (tensor, tensor, tensor, tensor, tensor) - %38 = "tf.StridedSlice"(%37#1, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %39 = "tf.Equal"(%38, 
%10) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %40 = "tf.All"(%39, %9) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor - %41 = "tf.If"(%40, %40, %38, %10) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_3980, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_3970} : (tensor, tensor, tensor, tensor) -> tensor - %42 = "tf.Identity"(%41) {device = ""} : (tensor) -> tensor - %43 = "tf.StridedSlice"(%37#1, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %44 = "tf.StridedSlice"(%37#1, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %45 = "tf.Sub"(%43, %44) {device = ""} : (tensor, tensor) -> tensor - %46 = "tf.LessEqual"(%10, %45) {device = ""} : (tensor, tensor) -> tensor - %47 = "tf.All"(%46, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor - %48 = "tf.If"(%47, %47, %45) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_4340, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_4330} : (tensor, tensor, tensor) -> tensor - %49 = "tf.Identity"(%48) {device = ""} : (tensor) -> tensor - %50 = "tf.Identity"(%37#1) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor - %51 = "tf.StridedSlice"(%50, %5, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %52 = "tf.Shape"(%37#0) {device = ""} : (tensor) -> tensor<1xi64> - %53 = "tf.StridedSlice"(%52, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %54 = "tf.Equal"(%51, %53) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %55 = "tf.All"(%54, %9) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor - %56 = "tf.If"(%55, %55, %51, %53) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_4680, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = 
@WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_4670} : (tensor, tensor, tensor, tensor) -> tensor - %57 = "tf.Identity"(%56) {device = ""} : (tensor) -> tensor - %58 = "tf.Identity"(%50) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor - %59 = "tf.Shape"(%58) {device = ""} : (tensor) -> tensor<1xi64> - %60 = "tf.StridedSlice"(%59, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %61 = "tf.Sub"(%60, %13) {device = ""} : (tensor, tensor) -> tensor - %62 = "tf.StridedSlice"(%37#4, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %63 = "tf.Equal"(%62, %10) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %64 = "tf.All"(%63, %9) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor - %65 = "tf.If"(%64, %64, %62, %10) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_5050, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_5040} : (tensor, tensor, tensor, tensor) -> tensor - %66 = "tf.Identity"(%65) {device = ""} : (tensor) -> tensor - %67 = "tf.StridedSlice"(%37#4, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %68 = "tf.StridedSlice"(%37#4, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %69 = "tf.Sub"(%67, %68) {device = ""} : (tensor, tensor) -> tensor - %70 = "tf.LessEqual"(%10, %69) {device = ""} : (tensor, tensor) -> tensor - %71 = "tf.All"(%70, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor - %72 = "tf.If"(%71, %71, %69) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_5410, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_5400} : (tensor, tensor, tensor) -> tensor - %73 = "tf.Identity"(%72) {device = ""} : (tensor) -> tensor - %74 = "tf.Identity"(%37#4) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor - %75 = "tf.StridedSlice"(%74, %5, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, 
shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %76 = "tf.Equal"(%75, %61) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %77 = "tf.All"(%76, %9) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor - %78 = "tf.If"(%77, %77, %75, %61) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_assert_equal_1_Assert_AssertGuard_false_5770, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_assert_equal_1_Assert_AssertGuard_true_5760} : (tensor, tensor, tensor, tensor) -> tensor - %79 = "tf.Identity"(%78) {device = ""} : (tensor) -> tensor - %80 = "tf.Identity"(%74) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor - %81 = "tf.StridedSlice"(%37#4, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %82 = "tf.Equal"(%81, %10) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %83 = "tf.All"(%82, %9) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor - %84 = "tf.If"(%83, %83, %81, %10) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_6120, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_6110} : (tensor, tensor, tensor, tensor) -> tensor - %85 = "tf.Identity"(%84) {device = ""} : (tensor) -> tensor - %86 = "tf.StridedSlice"(%37#4, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %87 = "tf.StridedSlice"(%37#4, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %88 = "tf.Sub"(%86, %87) {device = ""} : (tensor, tensor) -> tensor - %89 = "tf.LessEqual"(%10, %88) {device = ""} : (tensor, tensor) -> tensor - %90 = "tf.All"(%89, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor - %91 = "tf.If"(%90, %90, %88) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_6480, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_6470} : (tensor, tensor, tensor) -> tensor - %92 = "tf.Identity"(%91) {device = ""} : (tensor) -> tensor - %93 = "tf.Identity"(%37#4) {_class = 
["loc:@WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor - %94 = "tf.StridedSlice"(%93, %5, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %95 = "tf.Shape"(%37#2) {device = ""} : (tensor) -> tensor<1xi64> - %96 = "tf.StridedSlice"(%95, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %97 = "tf.Equal"(%94, %96) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %98 = "tf.All"(%97, %9) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor - %99 = "tf.If"(%98, %98, %94, %96) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_6820, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_6810} : (tensor, tensor, tensor, tensor) -> tensor - %100 = "tf.Identity"(%99) {device = ""} : (tensor) -> tensor - %101 = "tf.Identity"(%93) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor - %102 = "tf.Shape"(%101) {device = ""} : (tensor) -> tensor<1xi64> - %103 = "tf.StridedSlice"(%102, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %104 = "tf.Sub"(%103, %13) {device = ""} : (tensor, tensor) -> tensor - %105 = "tf.Equal"(%104, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %106 = "tf.LogicalOr"(%105, %2) {device = ""} : (tensor, tensor) -> tensor - %107 = "tf.Equal"(%104, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %108 = "tf.LogicalOr"(%106, %107) {device = ""} : (tensor, tensor) -> tensor - %109 = "tf.StridedSlice"(%101, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %110 = "tf.StridedSlice"(%101, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %111 = "tf.Sub"(%109, %110) {device = ""} : (tensor, tensor) -> tensor - %112 = "tf.Shape"(%101) {device = ""} : (tensor) -> tensor<1xi64> - %113 = "tf.StridedSlice"(%112, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %114 = "tf.Sub"(%113, %13) {device = ""} : (tensor, tensor) -> tensor - %115 = "tf.Equal"(%114, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %116 = "tf.ExpandDims"(%101, %7) {device = ""} : (tensor, tensor) -> tensor - %117 = "tf.Shape"(%101) {device = ""} : 
(tensor) -> tensor<1xi32> - %118 = "tf.StridedSlice"(%117, %15, %15, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %119 = "tf.StridedSlice"(%117, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %120 = "tf.StridedSlice"(%117, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %121 = "tf.StridedSlice"(%37#4, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %122 = "tf.Equal"(%121, %10) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %123 = "tf.All"(%122, %9) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor - %124 = "tf.If"(%123, %123, %121, %10) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_7190, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_7180} : (tensor, tensor, tensor, tensor) -> tensor - %125 = "tf.Identity"(%124) {device = ""} : (tensor) -> tensor - %126 = "tf.StridedSlice"(%37#4, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %127 = "tf.StridedSlice"(%37#4, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %128 = "tf.Sub"(%126, %127) {device = ""} : (tensor, tensor) -> tensor - %129 = "tf.LessEqual"(%10, %128) {device = ""} : (tensor, tensor) -> tensor - %130 = "tf.All"(%129, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor - %131 = "tf.If"(%130, %130, %128) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_7550, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_7540} : (tensor, tensor, tensor) -> tensor - %132 = "tf.Identity"(%131) {device = ""} : (tensor) -> tensor - %133 = "tf.Identity"(%37#4) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor - %134 = "tf.StridedSlice"(%133, %5, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, 
end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %135 = "tf.Shape"(%37#3) {device = ""} : (tensor) -> tensor<1xi64> - %136 = "tf.StridedSlice"(%135, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %137 = "tf.Equal"(%134, %136) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %138 = "tf.All"(%137, %9) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor - %139 = "tf.If"(%138, %138, %134, %136) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_7890, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_7880} : (tensor, tensor, tensor, tensor) -> tensor - %140 = "tf.Identity"(%139) {device = ""} : (tensor) -> tensor - %141 = "tf.Identity"(%133) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor - %142 = "tf.Shape"(%141) {device = ""} : (tensor) -> tensor<1xi64> - %143 = "tf.StridedSlice"(%142, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %144 = "tf.Sub"(%143, %13) {device = ""} : (tensor, tensor) -> tensor - %145 = "tf.Equal"(%144, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %146 = "tf.LogicalOr"(%145, %2) {device = ""} : (tensor, tensor) -> tensor - %147 = "tf.Equal"(%144, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %148 = "tf.LogicalOr"(%146, %147) {device = ""} : (tensor, tensor) -> tensor - %149 = "tf.StridedSlice"(%141, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %150 = "tf.StridedSlice"(%141, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %151 = "tf.Sub"(%149, %150) {device = ""} : (tensor, tensor) -> tensor - %152 = "tf.Shape"(%141) {device = ""} : (tensor) -> tensor<1xi64> - %153 = "tf.StridedSlice"(%152, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %154 = "tf.Sub"(%153, %13) {device = ""} : (tensor, tensor) -> tensor - %155 = "tf.Equal"(%154, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %156 = "tf.ExpandDims"(%141, %7) {device = ""} : (tensor, tensor) -> tensor - %157 = "tf.Shape"(%141) {device = ""} : (tensor) -> tensor<1xi32> - %158 = "tf.StridedSlice"(%157, %15, %15, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : 
i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %159 = "tf.StridedSlice"(%157, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %160 = "tf.StridedSlice"(%157, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %161 = "tf.StridedSlice"(%141, %5, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %162 = "tf.Range"(%10, %161, %13) {device = ""} : (tensor, tensor, tensor) -> tensor - %163 = "tf.StridedSlice"(%141, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %164 = "tf.StridedSlice"(%141, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %165 = "tf.Sub"(%163, %164) {device = ""} : (tensor, tensor) -> tensor - %166 = "tf.If"(%108, %108, %13, %104) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_AssertGuard_false_8690, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_AssertGuard_true_8680} : (tensor, tensor, tensor, tensor) -> tensor - %167 = "tf.Identity"(%166) {device = ""} : (tensor) -> tensor - %168 = "tf.Equal"(%104, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %169 = "tf.Select"(%168, %13, %104) {device = ""} : (tensor, tensor, tensor) -> tensor - %170 = "tf.Equal"(%169, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %171 = "tf.LogicalOr"(%170, %2) {device = ""} : (tensor, tensor) -> tensor - %172 = "tf.Equal"(%169, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %173 = "tf.LogicalOr"(%171, %172) {device = ""} : (tensor, tensor) -> tensor - %174 = "tf.Select"(%115, %169, %13) {device = ""} : (tensor, tensor, tensor) -> tensor - %175 = "tf.Pack"(%174, %13) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> - %176 = "tf.StridedSlice"(%175, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %177 = "tf.Cast"(%176) {Truncate = false, device = ""} : (tensor) -> tensor - %178 = "tf.Reshape"(%177, %9) {device = ""} : (tensor, tensor<0xi32>) -> tensor - %179 = "tf.Pack"(%7, %178) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %180 = "tf.Tile"(%116, %179) {device = ""} : (tensor, tensor<2xi32>) -> tensor - %181 = "tf.Mul"(%178, %119) {device = ""} : (tensor, tensor) -> tensor - %182 = "tf.Pack"(%181) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %183 = "tf.ConcatV2"(%118, %182, %120, %14) {device = ""} : (tensor<0xi32>, 
tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %184 = "tf.Reshape"(%180, %183) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %185 = "tf.Shape"(%184) {device = ""} : (tensor) -> tensor<1xi64> - %186 = "tf.StridedSlice"(%185, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %187 = "tf.Pack"(%176) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %188 = "tf.StridedSlice"(%184, %187, %11, %12) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %189 = "tf.Sub"(%186, %176) {device = ""} : (tensor, tensor) -> tensor - %190 = "tf.Pack"(%189) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %191 = "tf.StridedSlice"(%184, %11, %190, %12) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %192:2 = "tf.RaggedRange"(%191, %188, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) - %193 = "tf.Select"(%2, %169, %13) {device = ""} : (tensor, tensor, tensor) -> tensor - %194 = "tf.Pack"(%193, %13) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> - %195 = "tf.StridedSlice"(%194, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %196 = "tf.Cast"(%195) {Truncate = false, device = ""} : (tensor) -> tensor - %197 = "tf.Reshape"(%196, %9) {device = ""} : (tensor, tensor<0xi32>) -> tensor - %198 = "tf.Pack"(%7, %197) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %199 = "tf.Tile"(%4, %198) {device = ""} : (tensor<2x1xi64>, tensor<2xi32>) -> tensor<2x?xi64> - %200 = "tf.Mul"(%197, %8) {device = ""} : (tensor, tensor) -> tensor - %201 = "tf.Pack"(%200) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %202 = "tf.ConcatV2"(%9, %201, %9, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %203 = "tf.Reshape"(%199, %202) {device = ""} : (tensor<2x?xi64>, tensor<1xi32>) -> tensor - %204 = "tf.Shape"(%203) {device = ""} : (tensor) -> tensor<1xi64> - %205 = "tf.StridedSlice"(%204, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %206 = "tf.Pack"(%195) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %207 = "tf.StridedSlice"(%203, %206, %11, %12) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %208 = "tf.Sub"(%205, %195) {device = ""} : (tensor, tensor) -> tensor - %209 = "tf.Pack"(%208) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %210 = "tf.StridedSlice"(%203, %11, %209, %12) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor 
- %211:2 = "tf.RaggedRange"(%210, %207, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) - %212 = "tf.StridedSlice"(%194, %15, %16, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> - %213 = "tf.StridedSlice"(%194, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %214 = "tf.Mul"(%213, %12) {device = ""} : (tensor, tensor<1xi64>) -> tensor<1xi64> - %215 = "tf.Tile"(%214, %212) {device = ""} : (tensor<1xi64>, tensor<1xi64>) -> tensor - %216 = "tf.Cumsum"(%215, %14) {device = "", exclusive = false, reverse = false} : (tensor, tensor) -> tensor - %217 = "tf.ConcatV2"(%11, %216, %3) {device = ""} : (tensor<1xi64>, tensor, tensor) -> tensor - %218 = "tf.StridedSlice"(%217, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %219 = "tf.ExpandDims"(%218, %7) {device = ""} : (tensor, tensor) -> tensor - %220 = "tf.Shape"(%218) {device = ""} : (tensor) -> tensor<1xi32> - %221 = "tf.StridedSlice"(%220, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %222 = "tf.Pack"(%221) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %223 = "tf.StridedSlice"(%217, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %224 = "tf.ExpandDims"(%223, %7) {device = ""} : (tensor, tensor) -> tensor - %225 = "tf.Shape"(%223) {device = ""} : (tensor) -> tensor<1xi32> - %226 = "tf.StridedSlice"(%225, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %227 = "tf.Pack"(%226) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %228 = "tf.Equal"(%104, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %229 = "tf.Select"(%228, %169, %13) {device = ""} : (tensor, tensor, tensor) -> tensor - %230 = "tf.Cast"(%229) {Truncate = false, device = ""} : (tensor) -> tensor - %231 = "tf.Reshape"(%230, %9) {device = ""} : (tensor, tensor<0xi32>) -> tensor - %232 = "tf.Pack"(%7, %231) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %233 = "tf.Mul"(%231, %8) {device = ""} : (tensor, tensor) -> tensor - %234 = "tf.Pack"(%233) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %235 = "tf.ConcatV2"(%9, %234, %9, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %236 = "tf.Pack"(%229) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %237 = "tf.Pack"(%10, %104) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> - %238 = "tf.ExpandDims"(%237, %7) {device = ""} : (tensor<2xi64>, tensor) -> tensor<2x1xi64> - %239 = "tf.Tile"(%238, %232) {device = ""} : 
(tensor<2x1xi64>, tensor<2xi32>) -> tensor<2x?xi64> - %240 = "tf.Reshape"(%239, %235) {device = ""} : (tensor<2x?xi64>, tensor<1xi32>) -> tensor - %241 = "tf.Shape"(%240) {device = ""} : (tensor) -> tensor<1xi64> - %242 = "tf.StridedSlice"(%241, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %243 = "tf.Sub"(%242, %229) {device = ""} : (tensor, tensor) -> tensor - %244 = "tf.Pack"(%243) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %245 = "tf.StridedSlice"(%240, %11, %244, %12) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %246 = "tf.StridedSlice"(%240, %236, %11, %12) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %247:2 = "tf.RaggedRange"(%245, %246, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) - %248 = "tf.GatherV2"(%111, %247#1, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %249 = "tf.Cast"(%248) {Truncate = false, device = ""} : (tensor) -> tensor - %250 = "tf.BroadcastTo"(%249, %222) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %251 = "tf.Max"(%250, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor - %252 = "tf.Maximum"(%14, %251) {device = ""} : (tensor, tensor) -> tensor - %253 = "tf.Range"(%14, %252, %7) {device = ""} : (tensor, tensor, tensor) -> tensor - %254 = "tf.Pack"(%7, %252) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %255 = "tf.Tile"(%219, %254) {device = ""} : (tensor, tensor<2xi32>) -> tensor - %256 = "tf.Shape"(%255) {device = ""} : (tensor) -> tensor<2xi32> - %257 = "tf.StridedSlice"(%256, %15, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> - %258 = "tf.Prod"(%257, %15) {device = "", keep_dims = false} : (tensor<2xi32>, tensor<1xi32>) -> tensor - %259 = "tf.Pack"(%258) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %260 = "tf.Shape"(%255) {device = ""} : (tensor) -> tensor<2xi32> - %261 = "tf.StridedSlice"(%260, %15, %15, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %262 = "tf.Shape"(%255) {device = ""} : (tensor) -> tensor<2xi32> - %263 = "tf.StridedSlice"(%262, %6, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %264 = "tf.ConcatV2"(%261, %259, %263, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %265 = "tf.Reshape"(%255, %264) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %266 = "tf.ExpandDims"(%250, %3) {device = ""} : (tensor, tensor) -> tensor - %267 = "tf.Less"(%253, %266) {device = ""} : (tensor, tensor) -> tensor - %268 = "tf.Reshape"(%267, %5) {device 
= ""} : (tensor, tensor<1xi32>) -> tensor - %269 = "tf.Where"(%268) {device = ""} : (tensor) -> tensor - %270 = "tf.Squeeze"(%269) {device = "", squeeze_dims = [1]} : (tensor) -> tensor - %271 = "tf.GatherV2"(%265, %270, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %272 = "tf.Cast"(%248) {Truncate = false, device = ""} : (tensor) -> tensor - %273 = "tf.BroadcastTo"(%272, %227) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %274 = "tf.Max"(%273, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor - %275 = "tf.Maximum"(%14, %274) {device = ""} : (tensor, tensor) -> tensor - %276 = "tf.Range"(%14, %275, %7) {device = ""} : (tensor, tensor, tensor) -> tensor - %277 = "tf.Pack"(%7, %275) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %278 = "tf.Tile"(%224, %277) {device = ""} : (tensor, tensor<2xi32>) -> tensor - %279 = "tf.Shape"(%278) {device = ""} : (tensor) -> tensor<2xi32> - %280 = "tf.StridedSlice"(%279, %15, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> - %281 = "tf.Prod"(%280, %15) {device = "", keep_dims = false} : (tensor<2xi32>, tensor<1xi32>) -> tensor - %282 = "tf.Pack"(%281) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %283 = "tf.Shape"(%278) {device = ""} : (tensor) -> tensor<2xi32> - %284 = "tf.StridedSlice"(%283, %15, %15, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %285 = "tf.Shape"(%278) {device = ""} : (tensor) -> tensor<2xi32> - %286 = "tf.StridedSlice"(%285, %6, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %287 = "tf.ConcatV2"(%284, %282, %286, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %288 = "tf.Reshape"(%278, %287) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %289 = "tf.ExpandDims"(%273, %3) {device = ""} : (tensor, tensor) -> tensor - %290 = "tf.Less"(%276, %289) {device = ""} : (tensor, tensor) -> tensor - %291 = "tf.Reshape"(%290, %5) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %292 = "tf.Where"(%291) {device = ""} : (tensor) -> tensor - %293 = "tf.Squeeze"(%292) {device = "", squeeze_dims = [1]} : (tensor) -> tensor - %294 = "tf.GatherV2"(%288, %293, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %295:2 = "tf.RaggedRange"(%271, %294, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) - %296 = "tf.If"(%173, %173, %169, %13) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_1_AssertGuard_false_9760, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_1_AssertGuard_true_9750} : (tensor, tensor, tensor, tensor) -> tensor - %297 = "tf.Identity"(%296) {device = ""} : (tensor) -> tensor - %298 = "tf.Select"(%2, %169, %13) {device = ""} : (tensor, tensor, tensor) -> tensor - %299 = "tf.Pack"(%298) {axis = 0 : i64, device = ""} : (tensor) -> 
tensor<1xi64> - %300 = "tf.ConcatV2"(%1, %299, %12, %14) {device = ""} : (tensor<0xi64>, tensor<1xi64>, tensor<1xi64>, tensor) -> tensor<2xi64> - %301 = "tf.StridedSlice"(%300, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %302 = "tf.Equal"(%301, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %303 = "tf.StridedSlice"(%300, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %304 = "tf.StridedSlice"(%300, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %305 = "tf.Equal"(%304, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %306 = "tf.If"(%305, %305, %304, %248) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_2_AssertGuard_false_10250, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_2_AssertGuard_true_10240} : (tensor, tensor, tensor, tensor) -> tensor - %307 = "tf.Identity"(%306) {device = ""} : (tensor) -> tensor - %308 = "tf.If"(%302, %302, %248, %303) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_3_AssertGuard_false_10610, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_3_AssertGuard_true_10600} : (tensor, tensor, tensor, tensor) -> tensor - %309 = "tf.If"(%148, %148, %13, %144) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_Assert_AssertGuard_false_15310, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_Assert_AssertGuard_true_15300} : (tensor, tensor, tensor, tensor) -> tensor - %310 = "tf.Identity"(%309) {device = ""} : (tensor) -> tensor - %311 = "tf.Equal"(%144, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %312 = "tf.Select"(%311, %13, %144) {device = ""} : (tensor, tensor, tensor) -> tensor - %313 = "tf.Equal"(%312, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %314 = "tf.LogicalOr"(%313, %2) {device = ""} : (tensor, tensor) -> tensor - %315 = "tf.Equal"(%312, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %316 = "tf.LogicalOr"(%314, %315) {device = ""} : (tensor, tensor) -> tensor - %317 = "tf.Select"(%155, %312, %13) {device = ""} : (tensor, tensor, tensor) -> tensor - %318 = "tf.Pack"(%317, %13) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> - %319 = "tf.StridedSlice"(%318, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %320 = "tf.Cast"(%319) {Truncate = false, device = ""} : (tensor) -> 
tensor - %321 = "tf.Reshape"(%320, %9) {device = ""} : (tensor, tensor<0xi32>) -> tensor - %322 = "tf.Pack"(%7, %321) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %323 = "tf.Tile"(%156, %322) {device = ""} : (tensor, tensor<2xi32>) -> tensor - %324 = "tf.Mul"(%321, %159) {device = ""} : (tensor, tensor) -> tensor - %325 = "tf.Pack"(%324) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %326 = "tf.ConcatV2"(%158, %325, %160, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %327 = "tf.Reshape"(%323, %326) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %328 = "tf.Shape"(%327) {device = ""} : (tensor) -> tensor<1xi64> - %329 = "tf.StridedSlice"(%328, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %330 = "tf.Pack"(%319) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %331 = "tf.StridedSlice"(%327, %330, %11, %12) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %332 = "tf.Sub"(%329, %319) {device = ""} : (tensor, tensor) -> tensor - %333 = "tf.Pack"(%332) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %334 = "tf.StridedSlice"(%327, %11, %333, %12) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %335:2 = "tf.RaggedRange"(%334, %331, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) - %336 = "tf.GatherV2"(%162, %335#1, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %337 = "tf.StridedSlice"(%318, %15, %16, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> - %338 = "tf.StridedSlice"(%318, %15, %16, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> - %339 = "tf.StridedSlice"(%318, %6, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi64> - %340 = "tf.ConcatV2"(%338, %339, %14) {device = ""} : (tensor<1xi64>, tensor<0xi64>, tensor) -> tensor<1xi64> - %341 = "tf.StridedSlice"(%318, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %342 = "tf.Mul"(%165, %341) {device = ""} : (tensor, tensor) -> tensor - %343 = "tf.Tile"(%342, %337) {device = ""} : (tensor, tensor<1xi64>) -> tensor - %344 = "tf.Cumsum"(%343, %14) {device = "", exclusive = false, reverse = false} : (tensor, tensor) -> tensor - %345 = "tf.ConcatV2"(%11, %344, %3) {device = ""} : (tensor<1xi64>, tensor, tensor) -> tensor - %346 = "tf.Shape"(%345) {device = ""} : (tensor) -> tensor<1xi64> - %347 = 
"tf.StridedSlice"(%346, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %348 = "tf.Sub"(%347, %13) {device = ""} : (tensor, tensor) -> tensor - %349 = "tf.Equal"(%348, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %350 = "tf.LogicalOr"(%349, %2) {device = ""} : (tensor, tensor) -> tensor - %351 = "tf.Equal"(%348, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %352 = "tf.LogicalOr"(%350, %351) {device = ""} : (tensor, tensor) -> tensor - %353 = "tf.StridedSlice"(%345, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %354 = "tf.StridedSlice"(%345, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %355 = "tf.Sub"(%353, %354) {device = ""} : (tensor, tensor) -> tensor - %356 = "tf.Shape"(%345) {device = ""} : (tensor) -> tensor<1xi64> - %357 = "tf.StridedSlice"(%356, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %358 = "tf.Sub"(%357, %13) {device = ""} : (tensor, tensor) -> tensor - %359 = "tf.Equal"(%358, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %360 = "tf.ExpandDims"(%345, %7) {device = ""} : (tensor, tensor) -> tensor - %361 = "tf.Shape"(%345) {device = ""} : (tensor) -> tensor<1xi32> - %362 = "tf.StridedSlice"(%361, %15, %15, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %363 = "tf.StridedSlice"(%361, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %364 = "tf.StridedSlice"(%361, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %365 = "tf.Select"(%2, %312, %13) {device = ""} : (tensor, tensor, tensor) -> tensor - %366 = "tf.Pack"(%365, %13) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> - %367 = "tf.StridedSlice"(%366, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %368 = "tf.Cast"(%367) {Truncate = false, device = ""} : (tensor) -> tensor - %369 = "tf.Reshape"(%368, %9) {device = ""} : (tensor, tensor<0xi32>) -> tensor - %370 = "tf.Pack"(%7, %369) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %371 = "tf.Tile"(%4, %370) {device = ""} : (tensor<2x1xi64>, tensor<2xi32>) -> tensor<2x?xi64> - %372 = "tf.Mul"(%369, %8) {device = ""} : (tensor, tensor) -> tensor - %373 = 
"tf.Pack"(%372) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %374 = "tf.ConcatV2"(%9, %373, %9, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %375 = "tf.Reshape"(%371, %374) {device = ""} : (tensor<2x?xi64>, tensor<1xi32>) -> tensor - %376 = "tf.Shape"(%375) {device = ""} : (tensor) -> tensor<1xi64> - %377 = "tf.StridedSlice"(%376, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %378 = "tf.Pack"(%367) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %379 = "tf.StridedSlice"(%375, %378, %11, %12) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %380 = "tf.Sub"(%377, %367) {device = ""} : (tensor, tensor) -> tensor - %381 = "tf.Pack"(%380) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %382 = "tf.StridedSlice"(%375, %11, %381, %12) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %383:2 = "tf.RaggedRange"(%382, %379, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) - %384 = "tf.GatherV2"(%11, %383#1, %14) {batch_dims = 0 : i64, device = ""} : (tensor<1xi64>, tensor, tensor) -> tensor - %385 = "tf.GatherV2"(%12, %384, %14) {batch_dims = 0 : i64, device = ""} : (tensor<1xi64>, tensor, tensor) -> tensor - %386 = "tf.StridedSlice"(%366, %15, %16, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> - %387 = "tf.StridedSlice"(%366, %15, %16, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> - %388 = "tf.StridedSlice"(%366, %6, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi64> - %389 = "tf.ConcatV2"(%387, %388, %14) {device = ""} : (tensor<1xi64>, tensor<0xi64>, tensor) -> tensor<1xi64> - %390 = "tf.Tile"(%385, %389) {device = ""} : (tensor, tensor<1xi64>) -> tensor - %391 = "tf.StridedSlice"(%366, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %392 = "tf.Mul"(%391, %12) {device = ""} : (tensor, tensor<1xi64>) -> tensor<1xi64> - %393 = "tf.Tile"(%392, %386) {device = ""} : (tensor<1xi64>, tensor<1xi64>) -> tensor - %394 = "tf.Cumsum"(%393, %14) {device = "", exclusive = false, reverse = false} : (tensor, tensor) -> tensor - %395 = "tf.ConcatV2"(%11, %394, %3) {device = ""} : (tensor<1xi64>, tensor, tensor) -> tensor - %396 = "tf.StridedSlice"(%395, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, 
tensor<1xi32>, tensor<1xi32>) -> tensor - %397 = "tf.ExpandDims"(%396, %7) {device = ""} : (tensor, tensor) -> tensor - %398 = "tf.Shape"(%396) {device = ""} : (tensor) -> tensor<1xi32> - %399 = "tf.StridedSlice"(%398, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %400 = "tf.Pack"(%399) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %401 = "tf.StridedSlice"(%395, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %402 = "tf.ExpandDims"(%401, %7) {device = ""} : (tensor, tensor) -> tensor - %403 = "tf.Shape"(%401) {device = ""} : (tensor) -> tensor<1xi32> - %404 = "tf.StridedSlice"(%403, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %405 = "tf.Pack"(%404) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %406 = "tf.Equal"(%144, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %407 = "tf.Select"(%406, %312, %13) {device = ""} : (tensor, tensor, tensor) -> tensor - %408 = "tf.Cast"(%407) {Truncate = false, device = ""} : (tensor) -> tensor - %409 = "tf.Reshape"(%408, %9) {device = ""} : (tensor, tensor<0xi32>) -> tensor - %410 = "tf.Pack"(%7, %409) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %411 = "tf.Mul"(%409, %8) {device = ""} : (tensor, tensor) -> tensor - %412 = "tf.Pack"(%411) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %413 = "tf.ConcatV2"(%9, %412, %9, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %414 = "tf.Pack"(%407) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %415 = "tf.Pack"(%10, %144) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> - %416 = "tf.ExpandDims"(%415, %7) {device = ""} : (tensor<2xi64>, tensor) -> tensor<2x1xi64> - %417 = "tf.Tile"(%416, %410) {device = ""} : (tensor<2x1xi64>, tensor<2xi32>) -> tensor<2x?xi64> - %418 = "tf.Reshape"(%417, %413) {device = ""} : (tensor<2x?xi64>, tensor<1xi32>) -> tensor - %419 = "tf.Shape"(%418) {device = ""} : (tensor) -> tensor<1xi64> - %420 = "tf.StridedSlice"(%419, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %421 = "tf.Sub"(%420, %407) {device = ""} : (tensor, tensor) -> tensor - %422 = "tf.Pack"(%421) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %423 = "tf.StridedSlice"(%418, %11, %422, %12) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %424 = "tf.StridedSlice"(%418, %414, %11, %12) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %425:2 = "tf.RaggedRange"(%423, %424, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, 
tensor) - %426 = "tf.GatherV2"(%151, %425#1, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %427 = "tf.Cast"(%426) {Truncate = false, device = ""} : (tensor) -> tensor - %428 = "tf.BroadcastTo"(%427, %400) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %429 = "tf.Max"(%428, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor - %430 = "tf.Maximum"(%14, %429) {device = ""} : (tensor, tensor) -> tensor - %431 = "tf.Range"(%14, %430, %7) {device = ""} : (tensor, tensor, tensor) -> tensor - %432 = "tf.Pack"(%7, %430) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %433 = "tf.Tile"(%397, %432) {device = ""} : (tensor, tensor<2xi32>) -> tensor - %434 = "tf.Shape"(%433) {device = ""} : (tensor) -> tensor<2xi32> - %435 = "tf.StridedSlice"(%434, %15, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> - %436 = "tf.Prod"(%435, %15) {device = "", keep_dims = false} : (tensor<2xi32>, tensor<1xi32>) -> tensor - %437 = "tf.Pack"(%436) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %438 = "tf.Shape"(%433) {device = ""} : (tensor) -> tensor<2xi32> - %439 = "tf.StridedSlice"(%438, %15, %15, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %440 = "tf.Shape"(%433) {device = ""} : (tensor) -> tensor<2xi32> - %441 = "tf.StridedSlice"(%440, %6, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %442 = "tf.ConcatV2"(%439, %437, %441, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %443 = "tf.Reshape"(%433, %442) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %444 = "tf.ExpandDims"(%428, %3) {device = ""} : (tensor, tensor) -> tensor - %445 = "tf.Less"(%431, %444) {device = ""} : (tensor, tensor) -> tensor - %446 = "tf.Reshape"(%445, %5) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %447 = "tf.Where"(%446) {device = ""} : (tensor) -> tensor - %448 = "tf.Squeeze"(%447) {device = "", squeeze_dims = [1]} : (tensor) -> tensor - %449 = "tf.GatherV2"(%443, %448, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %450 = "tf.Cast"(%426) {Truncate = false, device = ""} : (tensor) -> tensor - %451 = "tf.BroadcastTo"(%450, %405) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %452 = "tf.Max"(%451, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor - %453 = "tf.Maximum"(%14, %452) {device = ""} : (tensor, tensor) -> tensor - %454 = "tf.Range"(%14, %453, %7) {device = ""} : (tensor, tensor, tensor) -> tensor - %455 = "tf.Pack"(%7, %453) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %456 = "tf.Tile"(%402, %455) {device = ""} : (tensor, tensor<2xi32>) -> tensor - %457 = "tf.Shape"(%456) {device = ""} : (tensor) -> tensor<2xi32> - %458 = "tf.StridedSlice"(%457, %15, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> - %459 = 
"tf.Prod"(%458, %15) {device = "", keep_dims = false} : (tensor<2xi32>, tensor<1xi32>) -> tensor - %460 = "tf.Pack"(%459) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %461 = "tf.Shape"(%456) {device = ""} : (tensor) -> tensor<2xi32> - %462 = "tf.StridedSlice"(%461, %15, %15, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %463 = "tf.Shape"(%456) {device = ""} : (tensor) -> tensor<2xi32> - %464 = "tf.StridedSlice"(%463, %6, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %465 = "tf.ConcatV2"(%462, %460, %464, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %466 = "tf.Reshape"(%456, %465) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %467 = "tf.ExpandDims"(%451, %3) {device = ""} : (tensor, tensor) -> tensor - %468 = "tf.Less"(%454, %467) {device = ""} : (tensor, tensor) -> tensor - %469 = "tf.Reshape"(%468, %5) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %470 = "tf.Where"(%469) {device = ""} : (tensor) -> tensor - %471 = "tf.Squeeze"(%470) {device = "", squeeze_dims = [1]} : (tensor) -> tensor - %472 = "tf.GatherV2"(%466, %471, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %473:2 = "tf.RaggedRange"(%449, %472, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) - %474 = "tf.GatherV2"(%390, %473#1, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %475 = "tf.If"(%316, %316, %312, %13) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_Assert_1_AssertGuard_false_16380, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_Assert_1_AssertGuard_true_16370} : (tensor, tensor, tensor, tensor) -> tensor - %476 = "tf.Identity"(%475) {device = ""} : (tensor) -> tensor - %477 = "tf.Select"(%2, %312, %13) {device = ""} : (tensor, tensor, tensor) -> tensor - %478 = "tf.Pack"(%477) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %479 = "tf.ConcatV2"(%1, %478, %12, %14) {device = ""} : (tensor<0xi64>, tensor<1xi64>, tensor<1xi64>, tensor) -> tensor<2xi64> - %480 = "tf.StridedSlice"(%479, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %481 = "tf.Equal"(%480, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %482 = "tf.StridedSlice"(%479, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %483 = "tf.StridedSlice"(%479, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %484 = "tf.Equal"(%483, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %485 = "tf.If"(%484, %484, 
%483, %426) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_Assert_2_AssertGuard_false_16870, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_Assert_2_AssertGuard_true_16860} : (tensor, tensor, tensor, tensor) -> tensor - %486 = "tf.Identity"(%485) {device = ""} : (tensor) -> tensor - %487 = "tf.If"(%481, %481, %426, %482) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_Assert_3_AssertGuard_false_17230, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_Assert_3_AssertGuard_true_17220} : (tensor, tensor, tensor, tensor) -> tensor - %488 = "tf.Identity"(%487) {device = ""} : (tensor) -> tensor - %489 = "tf.If"(%352, %352, %13, %348) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_AssertGuard_false_21910, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_AssertGuard_true_21900} : (tensor, tensor, tensor, tensor) -> tensor - %490 = "tf.Identity"(%489) {device = ""} : (tensor) -> tensor - %491 = "tf.Equal"(%348, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %492 = "tf.Select"(%491, %13, %348) {device = ""} : (tensor, tensor, tensor) -> tensor - %493 = "tf.Equal"(%492, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %494 = "tf.LogicalOr"(%493, %2) {device = ""} : (tensor, tensor) -> tensor - %495 = "tf.Equal"(%492, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %496 = "tf.LogicalOr"(%494, %495) {device = ""} : (tensor, tensor) -> tensor - %497 = "tf.Select"(%359, %492, %13) {device = ""} : (tensor, tensor, tensor) -> tensor - %498 = "tf.Pack"(%497, %13) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> - %499 = "tf.StridedSlice"(%498, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %500 = "tf.Cast"(%499) {Truncate = false, device = ""} : (tensor) -> tensor - %501 = "tf.Reshape"(%500, %9) {device = ""} : (tensor, tensor<0xi32>) -> tensor - %502 = "tf.Pack"(%7, %501) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %503 = "tf.Tile"(%360, %502) {device = ""} : (tensor, tensor<2xi32>) -> tensor - %504 = "tf.Mul"(%501, %363) {device = ""} : (tensor, tensor) -> tensor - %505 = "tf.Pack"(%504) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %506 = "tf.ConcatV2"(%362, %505, %364, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %507 = "tf.Reshape"(%503, %506) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %508 = "tf.Shape"(%507) {device = ""} : (tensor) -> tensor<1xi64> - %509 = "tf.StridedSlice"(%508, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %510 = "tf.Pack"(%499) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %511 = "tf.StridedSlice"(%507, %510, %11, %12) {begin_mask = 0 : 
i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %512 = "tf.Sub"(%509, %499) {device = ""} : (tensor, tensor) -> tensor - %513 = "tf.Pack"(%512) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %514 = "tf.StridedSlice"(%507, %11, %513, %12) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %515:2 = "tf.RaggedRange"(%514, %511, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) - %516 = "tf.Select"(%2, %492, %13) {device = ""} : (tensor, tensor, tensor) -> tensor - %517 = "tf.Pack"(%516, %13) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> - %518 = "tf.StridedSlice"(%517, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %519 = "tf.Cast"(%518) {Truncate = false, device = ""} : (tensor) -> tensor - %520 = "tf.Reshape"(%519, %9) {device = ""} : (tensor, tensor<0xi32>) -> tensor - %521 = "tf.Pack"(%7, %520) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %522 = "tf.Tile"(%4, %521) {device = ""} : (tensor<2x1xi64>, tensor<2xi32>) -> tensor<2x?xi64> - %523 = "tf.Mul"(%520, %8) {device = ""} : (tensor, tensor) -> tensor - %524 = "tf.Pack"(%523) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %525 = "tf.ConcatV2"(%9, %524, %9, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %526 = "tf.Reshape"(%522, %525) {device = ""} : (tensor<2x?xi64>, tensor<1xi32>) -> tensor - %527 = "tf.Shape"(%526) {device = ""} : (tensor) -> tensor<1xi64> - %528 = "tf.StridedSlice"(%527, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %529 = "tf.Pack"(%518) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %530 = "tf.StridedSlice"(%526, %529, %11, %12) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %531 = "tf.Sub"(%528, %518) {device = ""} : (tensor, tensor) -> tensor - %532 = "tf.Pack"(%531) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %533 = "tf.StridedSlice"(%526, %11, %532, %12) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %534:2 = "tf.RaggedRange"(%533, %530, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) - %535 = "tf.StridedSlice"(%517, %15, %16, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> - %536 = "tf.StridedSlice"(%517, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, 
tensor<1xi32>, tensor<1xi32>) -> tensor - %537 = "tf.Mul"(%536, %12) {device = ""} : (tensor, tensor<1xi64>) -> tensor<1xi64> - %538 = "tf.Tile"(%537, %535) {device = ""} : (tensor<1xi64>, tensor<1xi64>) -> tensor - %539 = "tf.Cumsum"(%538, %14) {device = "", exclusive = false, reverse = false} : (tensor, tensor) -> tensor - %540 = "tf.ConcatV2"(%11, %539, %3) {device = ""} : (tensor<1xi64>, tensor, tensor) -> tensor - %541 = "tf.StridedSlice"(%540, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %542 = "tf.ExpandDims"(%541, %7) {device = ""} : (tensor, tensor) -> tensor - %543 = "tf.Shape"(%541) {device = ""} : (tensor) -> tensor<1xi32> - %544 = "tf.StridedSlice"(%543, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %545 = "tf.Pack"(%544) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %546 = "tf.StridedSlice"(%540, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %547 = "tf.ExpandDims"(%546, %7) {device = ""} : (tensor, tensor) -> tensor - %548 = "tf.Shape"(%546) {device = ""} : (tensor) -> tensor<1xi32> - %549 = "tf.StridedSlice"(%548, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %550 = "tf.Pack"(%549) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %551 = "tf.Equal"(%348, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %552 = "tf.Select"(%551, %492, %13) {device = ""} : (tensor, tensor, tensor) -> tensor - %553 = "tf.Cast"(%552) {Truncate = false, device = ""} : (tensor) -> tensor - %554 = "tf.Reshape"(%553, %9) {device = ""} : (tensor, tensor<0xi32>) -> tensor - %555 = "tf.Pack"(%7, %554) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %556 = "tf.Mul"(%554, %8) {device = ""} : (tensor, tensor) -> tensor - %557 = "tf.Pack"(%556) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %558 = "tf.ConcatV2"(%9, %557, %9, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %559 = "tf.Pack"(%552) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %560 = "tf.Pack"(%10, %348) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> - %561 = "tf.ExpandDims"(%560, %7) {device = ""} : (tensor<2xi64>, tensor) -> tensor<2x1xi64> - %562 = "tf.Tile"(%561, %555) {device = ""} : (tensor<2x1xi64>, tensor<2xi32>) -> tensor<2x?xi64> - %563 = "tf.Reshape"(%562, %558) {device = ""} : (tensor<2x?xi64>, tensor<1xi32>) -> tensor - %564 = "tf.Shape"(%563) {device = ""} : (tensor) -> tensor<1xi64> - %565 = "tf.StridedSlice"(%564, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %566 = "tf.Sub"(%565, %552) {device = ""} : (tensor, tensor) -> tensor - %567 = "tf.Pack"(%566) {axis = 0 : i64, device = ""} : (tensor) -> 
tensor<1xi64> - %568 = "tf.StridedSlice"(%563, %11, %567, %12) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %569 = "tf.StridedSlice"(%563, %559, %11, %12) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor - %570:2 = "tf.RaggedRange"(%568, %569, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) - %571 = "tf.GatherV2"(%355, %570#1, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %572 = "tf.Cast"(%571) {Truncate = false, device = ""} : (tensor) -> tensor - %573 = "tf.BroadcastTo"(%572, %545) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %574 = "tf.Max"(%573, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor - %575 = "tf.Maximum"(%14, %574) {device = ""} : (tensor, tensor) -> tensor - %576 = "tf.Range"(%14, %575, %7) {device = ""} : (tensor, tensor, tensor) -> tensor - %577 = "tf.Pack"(%7, %575) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %578 = "tf.Tile"(%542, %577) {device = ""} : (tensor, tensor<2xi32>) -> tensor - %579 = "tf.Shape"(%578) {device = ""} : (tensor) -> tensor<2xi32> - %580 = "tf.StridedSlice"(%579, %15, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> - %581 = "tf.Prod"(%580, %15) {device = "", keep_dims = false} : (tensor<2xi32>, tensor<1xi32>) -> tensor - %582 = "tf.Pack"(%581) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %583 = "tf.Shape"(%578) {device = ""} : (tensor) -> tensor<2xi32> - %584 = "tf.StridedSlice"(%583, %15, %15, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %585 = "tf.Shape"(%578) {device = ""} : (tensor) -> tensor<2xi32> - %586 = "tf.StridedSlice"(%585, %6, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %587 = "tf.ConcatV2"(%584, %582, %586, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %588 = "tf.Reshape"(%578, %587) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %589 = "tf.ExpandDims"(%573, %3) {device = ""} : (tensor, tensor) -> tensor - %590 = "tf.Less"(%576, %589) {device = ""} : (tensor, tensor) -> tensor - %591 = "tf.Reshape"(%590, %5) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %592 = "tf.Where"(%591) {device = ""} : (tensor) -> tensor - %593 = "tf.Squeeze"(%592) {device = "", squeeze_dims = [1]} : (tensor) -> tensor - %594 = "tf.GatherV2"(%588, %593, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %595 = "tf.Cast"(%571) {Truncate = false, device = ""} : (tensor) -> tensor - %596 = "tf.BroadcastTo"(%595, %550) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %597 = "tf.Max"(%596, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor - %598 = "tf.Maximum"(%14, %597) {device = ""} 
: (tensor, tensor) -> tensor - %599 = "tf.Range"(%14, %598, %7) {device = ""} : (tensor, tensor, tensor) -> tensor - %600 = "tf.Pack"(%7, %598) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> - %601 = "tf.Tile"(%547, %600) {device = ""} : (tensor, tensor<2xi32>) -> tensor - %602 = "tf.Shape"(%601) {device = ""} : (tensor) -> tensor<2xi32> - %603 = "tf.StridedSlice"(%602, %15, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> - %604 = "tf.Prod"(%603, %15) {device = "", keep_dims = false} : (tensor<2xi32>, tensor<1xi32>) -> tensor - %605 = "tf.Pack"(%604) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> - %606 = "tf.Shape"(%601) {device = ""} : (tensor) -> tensor<2xi32> - %607 = "tf.StridedSlice"(%606, %15, %15, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %608 = "tf.Shape"(%601) {device = ""} : (tensor) -> tensor<2xi32> - %609 = "tf.StridedSlice"(%608, %6, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %610 = "tf.ConcatV2"(%607, %605, %609, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> - %611 = "tf.Reshape"(%601, %610) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %612 = "tf.ExpandDims"(%596, %3) {device = ""} : (tensor, tensor) -> tensor - %613 = "tf.Less"(%599, %612) {device = ""} : (tensor, tensor) -> tensor - %614 = "tf.Reshape"(%613, %5) {device = ""} : (tensor, tensor<1xi32>) -> tensor - %615 = "tf.Where"(%614) {device = ""} : (tensor) -> tensor - %616 = "tf.Squeeze"(%615) {device = "", squeeze_dims = [1]} : (tensor) -> tensor - %617 = "tf.GatherV2"(%611, %616, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %618:2 = "tf.RaggedRange"(%594, %617, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) - %619 = "tf.If"(%496, %496, %492, %13) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_1_AssertGuard_false_22980, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_1_AssertGuard_true_22970} : (tensor, tensor, tensor, tensor) -> tensor - %620 = "tf.Identity"(%619) {device = ""} : (tensor) -> tensor - %621 = "tf.Select"(%2, %492, %13) {device = ""} : (tensor, tensor, tensor) -> tensor - %622 = "tf.Pack"(%621) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> - %623 = "tf.ConcatV2"(%1, %622, %12, %14) {device = ""} : (tensor<0xi64>, tensor<1xi64>, tensor<1xi64>, tensor) -> tensor<2xi64> - %624 = "tf.StridedSlice"(%623, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %625 = "tf.Equal"(%624, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %626 = "tf.StridedSlice"(%623, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 
: i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %627 = "tf.StridedSlice"(%623, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor - %628 = "tf.Equal"(%627, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor - %629 = "tf.If"(%628, %628, %627, %571) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_2_AssertGuard_false_23470, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_2_AssertGuard_true_23460} : (tensor, tensor, tensor, tensor) -> tensor - %630 = "tf.Identity"(%629) {device = ""} : (tensor) -> tensor - %631 = "tf.If"(%625, %625, %571, %626) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_3_AssertGuard_false_23830, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_3_AssertGuard_true_23820} : (tensor, tensor, tensor, tensor) -> tensor - %632 = "tf.Identity"(%631) {device = ""} : (tensor) -> tensor - %633 = "tf.Identity"(%308) {device = ""} : (tensor) -> tensor - %634 = "tf.Shape"(%37#2) {device = ""} : (tensor) -> tensor<1xi32> - %635 = "tf.StridedSlice"(%634, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %636 = "tf.Cast"(%635) {Truncate = false, device = ""} : (tensor<0xi32>) -> tensor<0xi64> - %637 = "tf.Identity"(%636) {device = ""} : (tensor<0xi64>) -> tensor<0xi64> - %638 = "tf.Shape"(%37#3) {device = ""} : (tensor) -> tensor<1xi32> - %639 = "tf.StridedSlice"(%638, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %640 = "tf.Cast"(%639) {Truncate = false, device = ""} : (tensor<0xi32>) -> tensor<0xi64> - %641 = "tf.Identity"(%640) {device = ""} : (tensor<0xi64>) -> tensor<0xi64> - %642 = "tf.GatherV2"(%37#3, %336, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor - %643 = "tf.Tile"(%642, %340) {device = ""} : (tensor, tensor<1xi64>) -> tensor - %644 = "tf.Sub"(%643, %474) {device = ""} : (tensor, tensor) -> tensor - %645 = "tf.Shape"(%644) {device = ""} : (tensor) -> tensor<1xi32> - %646 = "tf.StridedSlice"(%645, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> - %647 = "tf.Cast"(%646) {Truncate = false, device = ""} : (tensor<0xi32>) -> tensor<0xi64> - %648 = "tf.Identity"(%647) {device = ""} : (tensor<0xi64>) -> tensor<0xi64> - %649 = "tf.UnicodeEncode"(%37#0, %58) {Tsplits = i64, device = "", errors = "replace", output_encoding = "UTF-8", replacement_char = 65533 : i64} : (tensor, tensor) -> tensor - %650 = "tf.Identity"(%649) {device = ""} : (tensor) 
-> tensor - return %650 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedConcat_assert_equal_1_Assert_AssertGuard_false_3220(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Input tensors have incompatible shapes."> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/RaggedConcat/RaggedFromTensor/Const:0) = "> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/WhitespaceTokenize/RaggedConcat/RaggedNRows/Const:0) = "> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedConcat_assert_equal_1_Assert_AssertGuard_true_3210(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_3980(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:zero"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits/RowPartitionFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits/RowPartitionFromRowSplits/Const:0) = "> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_3970(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_4340(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid 
RaggedTensor:monotonic"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<"Condition x >= 0 did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits/RowPartitionFromRowSplits/sub:0) = "> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor) -> () - %3 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %4 = "tf.Identity"(%3) {device = ""} : (tensor) -> tensor - return %4 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_4330(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_4680(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Arguments to _from_row_partition do not form a valid RaggedTensor"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits/strided_slice_1:0) = "> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_4670(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_5050(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:zero"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits_1/RowPartitionFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits_1/RowPartitionFromRowSplits/Const:0) = "> : 
tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_5040(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_5410(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:monotonic"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<"Condition x >= 0 did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits_1/RowPartitionFromRowSplits/sub:0) = "> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor) -> () - %3 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %4 = "tf.Identity"(%3) {device = ""} : (tensor) -> tensor - return %4 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_5400(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_assert_equal_1_Assert_AssertGuard_false_5770(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Arguments to _from_row_partition do not form a valid RaggedTensor"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits_1/strided_slice:0) = "> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits_1/RaggedNRows/sub:0) = "> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_assert_equal_1_Assert_AssertGuard_true_5760(%arg0: tensor, %arg1: tensor, 
%arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_6120(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:zero"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_1/RaggedFromRowSplits/RowPartitionFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_1/RaggedFromRowSplits/RowPartitionFromRowSplits/Const:0) = "> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_6110(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_6480(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:monotonic"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<"Condition x >= 0 did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_1/RaggedFromRowSplits/RowPartitionFromRowSplits/sub:0) = "> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor) -> () - %3 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %4 = "tf.Identity"(%3) {device = ""} : (tensor) -> tensor - return %4 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_6470(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_6820(%arg0: tensor, %arg1: tensor, 
%arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Arguments to _from_row_partition do not form a valid RaggedTensor"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_1/RaggedFromRowSplits/strided_slice_1:0) = "> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_1/RaggedFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_6810(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_7190(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:zero"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_2/RaggedFromRowSplits/RowPartitionFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_2/RaggedFromRowSplits/RowPartitionFromRowSplits/Const:0) = "> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_7180(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_7550(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:monotonic"> : tensor} : () -> tensor 
- %1 = "tf.Const"() {value = dense<"Condition x >= 0 did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_2/RaggedFromRowSplits/RowPartitionFromRowSplits/sub:0) = "> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor) -> () - %3 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %4 = "tf.Identity"(%3) {device = ""} : (tensor) -> tensor - return %4 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_7540(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_7890(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Arguments to _from_row_partition do not form a valid RaggedTensor"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_2/RaggedFromRowSplits/strided_slice_1:0) = "> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_2/RaggedFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_7880(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_AssertGuard_false_8690(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func 
@WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_AssertGuard_true_8680(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_1_AssertGuard_false_9760(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_1_AssertGuard_true_9750(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_2_AssertGuard_false_10250(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_2_AssertGuard_true_10240(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_3_AssertGuard_false_10610(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, 
tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_3_AssertGuard_true_10600(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_Assert_AssertGuard_false_15310(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_Assert_AssertGuard_true_15300(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_Assert_1_AssertGuard_false_16380(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_Assert_1_AssertGuard_true_16370(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_Assert_2_AssertGuard_false_16870(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, 
%arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_Assert_2_AssertGuard_true_16860(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_Assert_3_AssertGuard_false_17230(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_Assert_3_AssertGuard_true_17220(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_AssertGuard_false_21910(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_AssertGuard_true_21900(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_1_AssertGuard_false_22980(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - %2 = "tf.Const"() {value = 
dense<"lengths="> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_1_AssertGuard_true_22970(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_2_AssertGuard_false_23470(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_2_AssertGuard_true_23460(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_3_AssertGuard_false_23830(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape<>], tf.signature.is_stateful} { - %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor - %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor - %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor - "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () - %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor - return %5 : tensor - } - func @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_3_AssertGuard_true_23820(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape<>]} { - %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor - %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor - return %1 : tensor - } - - // CHECK: func @whitespace_tokenizer_rank0(%arg0: tensor {tf._user_specified_name = "input"}) -> tensor attributes {tf._input_shapes = [#tf.shape<>], tf.api_implements = "tftext:WhitespaceTokenizer", tf.signature.is_stateful} { - // CHECK: %0 = "tfl.custom"(%arg0) 
{custom_code = "tftext:WhitespaceTokenizer", custom_option = opaque<"tfl", "0x"> : tensor<0xi8>} : (tensor) -> tensor - // CHECK: return %0 : tensor +func @whitespace_tokenizer_rank1(%arg0: tensor<1x!tf.string> {tf._user_specified_name = "input"}) -> (tensor, tensor) attributes {sym_visibility = "private", tf._input_shapes = [#tf.shape<1>], tf._implements = #tf.func<@"tftext:WhitespaceTokenizer", {}>, tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<[0, 1]> : tensor<2xi64>} : () -> tensor<2xi64> + %1 = "tf.Const"() {value = dense<[]> : tensor<0xi64>} : () -> tensor<0xi64> + %2 = "tf.Const"() {value = dense : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + %4 = "tf.Const"() {value = dense<[[0], [1]]> : tensor<2x1xi64>} : () -> tensor<2x1xi64> + %5 = "tf.Const"() {value = dense<-1> : tensor<1xi32>} : () -> tensor<1xi32> + %6 = "tf.Const"() {value = dense<2> : tensor<1xi32>} : () -> tensor<1xi32> + %7 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %8 = "tf.Const"() {value = dense<2> : tensor} : () -> tensor + %9 = "tf.Const"() {value = dense<[]> : tensor<0xi32>} : () -> tensor<0xi32> + %10 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %11 = "tf.Const"() {value = dense<0> : tensor<1xi64>} : () -> tensor<1xi64> + %12 = "tf.Const"() {value = dense<1> : tensor<1xi64>} : () -> tensor<1xi64> + %13 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %14 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %15 = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> + %16 = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + %17 = "tf.If"(%2, %2, %13, %13) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedConcat_assert_equal_1_Assert_AssertGuard_false_3210, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedConcat_assert_equal_1_Assert_AssertGuard_true_3200} : (tensor, tensor, tensor, tensor) -> tensor + %18 = "tf.Identity"(%17) {device = ""} : (tensor) -> tensor + %19 = "tf.StringLength"(%arg0) {device = "", unit = "BYTE"} : (tensor<1x!tf.string>) -> tensor<1xi32> + %20 = "tf.ExpandDims"(%19, %7) {device = ""} : (tensor<1xi32>, tensor) -> tensor<1x1xi32> + %21 = "tf.Cast"(%20) {Truncate = false, device = ""} : (tensor<1x1xi32>) -> tensor<1x1xi64> + %22 = "tf.Reshape"(%21, %12) {device = ""} : (tensor<1x1xi64>, tensor<1xi64>) -> tensor<1xi64> + %23 = "tf.Reshape"(%arg0, %5) {device = ""} : (tensor<1x!tf.string>, tensor<1xi32>) -> tensor<1x!tf.string> + %24:3 = "tf.UnicodeDecodeWithOffsets"(%23) {Tsplits = i64, device = "", errors = "replace", input_encoding = "UTF-8", replace_control_characters = false, replacement_char = 65533 : i64} : (tensor<1x!tf.string>) -> (tensor<2xi64>, tensor, tensor) + %25 = "tf.StridedSlice"(%24#0, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> + %26 = "tf.AddV2"(%25, %13) {device = ""} : (tensor<1xi64>, tensor) -> tensor<1xi64> + %27 = "tf.StridedSlice"(%24#0, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> + %28 = "tf.Minimum"(%26, %27) {device = ""} : (tensor<1xi64>, 
tensor<1xi64>) -> tensor<1xi64> + %29:2 = "tf.RaggedRange"(%28, %27, %13) {T = i64, Tsplits = i64, device = ""} : (tensor<1xi64>, tensor<1xi64>, tensor) -> (tensor<2xi64>, tensor) + %30 = "tf.StridedSlice"(%29#0, %5, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %31 = "tf.AddV2"(%30, %12) {device = ""} : (tensor, tensor<1xi64>) -> tensor<1xi64> + %32 = "tf.ConcatV2"(%29#0, %31, %14) {device = ""} : (tensor<2xi64>, tensor<1xi64>, tensor) -> tensor<3xi64> + %33 = "tf.GatherV2"(%24#2, %29#1, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %34 = "tf.ConcatV2"(%33, %22, %14) {device = ""} : (tensor, tensor<1xi64>, tensor) -> tensor + %35:2 = "tf.RaggedGather"(%32, %34, %0) {OUTPUT_RAGGED_RANK = 1 : i64, PARAMS_RAGGED_RANK = 1 : i64, Tindices = i64, Tsplits = i64, Tvalues = i64, device = ""} : (tensor<3xi64>, tensor, tensor<2xi64>) -> (tensor, tensor) + %36:5 = "tf.WhitespaceTokenizeWithOffsets"(%24#1, %24#0) {Tsplits = i64, device = ""} : (tensor, tensor<2xi64>) -> (tensor, tensor, tensor, tensor, tensor) + %37 = "tf.StridedSlice"(%36#1, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %38 = "tf.Equal"(%37, %10) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %39 = "tf.All"(%38, %9) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor + %40 = "tf.If"(%39, %39, %37, %10) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_3970, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_3960} : (tensor, tensor, tensor, tensor) -> tensor + %41 = "tf.Identity"(%40) {device = ""} : (tensor) -> tensor + %42 = "tf.StridedSlice"(%36#1, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %43 = "tf.StridedSlice"(%36#1, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %44 = "tf.Sub"(%42, %43) {device = ""} : (tensor, tensor) -> tensor + %45 = "tf.LessEqual"(%10, %44) {device = ""} : (tensor, tensor) -> tensor + %46 = "tf.All"(%45, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor + %47 = "tf.If"(%46, %46, %44) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_4330, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_4320} : (tensor, tensor, tensor) -> tensor + %48 = 
"tf.Identity"(%47) {device = ""} : (tensor) -> tensor + %49 = "tf.Identity"(%36#1) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor + %50 = "tf.StridedSlice"(%49, %5, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %51 = "tf.Shape"(%36#0) {device = ""} : (tensor) -> tensor<1xi64> + %52 = "tf.StridedSlice"(%51, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %53 = "tf.Equal"(%50, %52) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %54 = "tf.All"(%53, %9) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor + %55 = "tf.If"(%54, %54, %50, %52) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_4670, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_4660} : (tensor, tensor, tensor, tensor) -> tensor + %56 = "tf.Identity"(%55) {device = ""} : (tensor) -> tensor + %57 = "tf.Identity"(%49) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor + %58 = "tf.Shape"(%57) {device = ""} : (tensor) -> tensor<1xi64> + %59 = "tf.StridedSlice"(%58, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %60 = "tf.Sub"(%59, %13) {device = ""} : (tensor, tensor) -> tensor + %61 = "tf.StridedSlice"(%36#4, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %62 = "tf.Equal"(%61, %10) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %63 = "tf.All"(%62, %9) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor + %64 = "tf.If"(%63, %63, %61, %10) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_5040, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_5030} : (tensor, tensor, tensor, tensor) -> tensor + %65 = "tf.Identity"(%64) {device = ""} : (tensor) -> tensor + %66 = "tf.StridedSlice"(%36#4, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %67 = "tf.StridedSlice"(%36#4, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %68 = "tf.Sub"(%66, %67) 
{device = ""} : (tensor, tensor) -> tensor + %69 = "tf.LessEqual"(%10, %68) {device = ""} : (tensor, tensor) -> tensor + %70 = "tf.All"(%69, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor + %71 = "tf.If"(%70, %70, %68) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_5400, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_5390} : (tensor, tensor, tensor) -> tensor + %72 = "tf.Identity"(%71) {device = ""} : (tensor) -> tensor + %73 = "tf.Identity"(%36#4) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor + %74 = "tf.StridedSlice"(%73, %5, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %75 = "tf.Equal"(%74, %60) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %76 = "tf.All"(%75, %9) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor + %77 = "tf.If"(%76, %76, %74, %60) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_assert_equal_1_Assert_AssertGuard_false_5760, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_assert_equal_1_Assert_AssertGuard_true_5750} : (tensor, tensor, tensor, tensor) -> tensor + %78 = "tf.Identity"(%77) {device = ""} : (tensor) -> tensor + %79 = "tf.Identity"(%73) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor + %80 = "tf.StridedSlice"(%36#4, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %81 = "tf.Equal"(%80, %10) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %82 = "tf.All"(%81, %9) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor + %83 = "tf.If"(%82, %82, %80, %10) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_6110, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_6100} : (tensor, tensor, tensor, tensor) -> tensor + %84 = "tf.Identity"(%83) {device = ""} : (tensor) -> tensor + %85 = "tf.StridedSlice"(%36#4, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %86 = "tf.StridedSlice"(%36#4, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, 
tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %87 = "tf.Sub"(%85, %86) {device = ""} : (tensor, tensor) -> tensor + %88 = "tf.LessEqual"(%10, %87) {device = ""} : (tensor, tensor) -> tensor + %89 = "tf.All"(%88, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor + %90 = "tf.If"(%89, %89, %87) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_6470, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_6460} : (tensor, tensor, tensor) -> tensor + %91 = "tf.Identity"(%90) {device = ""} : (tensor) -> tensor + %92 = "tf.Identity"(%36#4) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor + %93 = "tf.StridedSlice"(%92, %5, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %94 = "tf.Shape"(%36#2) {device = ""} : (tensor) -> tensor<1xi64> + %95 = "tf.StridedSlice"(%94, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %96 = "tf.Equal"(%93, %95) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %97 = "tf.All"(%96, %9) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor + %98 = "tf.If"(%97, %97, %93, %95) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_6810, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_6800} : (tensor, tensor, tensor, tensor) -> tensor + %99 = "tf.Identity"(%98) {device = ""} : (tensor) -> tensor + %100 = "tf.Identity"(%92) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor + %101 = "tf.Shape"(%100) {device = ""} : (tensor) -> tensor<1xi64> + %102 = "tf.StridedSlice"(%101, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %103 = "tf.Sub"(%102, %13) {device = ""} : (tensor, tensor) -> tensor + %104 = "tf.Equal"(%103, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %105 = "tf.LogicalOr"(%104, %2) {device = ""} : (tensor, tensor) -> tensor + %106 = "tf.Equal"(%103, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %107 = "tf.LogicalOr"(%105, %106) {device = ""} : (tensor, tensor) -> tensor + %108 = "tf.StridedSlice"(%100, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %109 = "tf.StridedSlice"(%100, %15, %5, %16) {begin_mask = 1 : 
i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %110 = "tf.Sub"(%108, %109) {device = ""} : (tensor, tensor) -> tensor + %111 = "tf.Shape"(%100) {device = ""} : (tensor) -> tensor<1xi64> + %112 = "tf.StridedSlice"(%111, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %113 = "tf.Sub"(%112, %13) {device = ""} : (tensor, tensor) -> tensor + %114 = "tf.Equal"(%113, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %115 = "tf.ExpandDims"(%100, %7) {device = ""} : (tensor, tensor) -> tensor + %116 = "tf.Shape"(%100) {device = ""} : (tensor) -> tensor<1xi32> + %117 = "tf.StridedSlice"(%116, %15, %15, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %118 = "tf.StridedSlice"(%116, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %119 = "tf.StridedSlice"(%116, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %120 = "tf.StridedSlice"(%36#4, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %121 = "tf.Equal"(%120, %10) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %122 = "tf.All"(%121, %9) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor + %123 = "tf.If"(%122, %122, %120, %10) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_7180, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_7170} : (tensor, tensor, tensor, tensor) -> tensor + %124 = "tf.Identity"(%123) {device = ""} : (tensor) -> tensor + %125 = "tf.StridedSlice"(%36#4, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %126 = "tf.StridedSlice"(%36#4, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %127 = "tf.Sub"(%125, %126) {device = ""} : (tensor, tensor) -> tensor + %128 = "tf.LessEqual"(%10, %127) {device = ""} : (tensor, tensor) -> tensor + %129 = "tf.All"(%128, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor + %130 = "tf.If"(%129, %129, %127) {_lower_using_switch_merge = true, 
_read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_7540, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_7530} : (tensor, tensor, tensor) -> tensor + %131 = "tf.Identity"(%130) {device = ""} : (tensor) -> tensor + %132 = "tf.Identity"(%36#4) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor + %133 = "tf.StridedSlice"(%132, %5, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %134 = "tf.Shape"(%36#3) {device = ""} : (tensor) -> tensor<1xi64> + %135 = "tf.StridedSlice"(%134, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %136 = "tf.Equal"(%133, %135) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %137 = "tf.All"(%136, %9) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor + %138 = "tf.If"(%137, %137, %133, %135) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_7880, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_7870} : (tensor, tensor, tensor, tensor) -> tensor + %139 = "tf.Identity"(%138) {device = ""} : (tensor) -> tensor + %140 = "tf.Identity"(%132) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor + %141 = "tf.Shape"(%140) {device = ""} : (tensor) -> tensor<1xi64> + %142 = "tf.StridedSlice"(%141, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %143 = "tf.Sub"(%142, %13) {device = ""} : (tensor, tensor) -> tensor + %144 = "tf.Equal"(%143, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %145 = "tf.LogicalOr"(%144, %2) {device = ""} : (tensor, tensor) -> tensor + %146 = "tf.Equal"(%143, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %147 = "tf.LogicalOr"(%145, %146) {device = ""} : (tensor, tensor) -> tensor + %148 = "tf.StridedSlice"(%140, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %149 = "tf.StridedSlice"(%140, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %150 = "tf.Sub"(%148, %149) {device = ""} : (tensor, tensor) -> tensor + %151 = "tf.Shape"(%140) {device = ""} : (tensor) -> tensor<1xi64> + %152 = 
"tf.StridedSlice"(%151, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %153 = "tf.Sub"(%152, %13) {device = ""} : (tensor, tensor) -> tensor + %154 = "tf.Equal"(%153, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %155 = "tf.ExpandDims"(%140, %7) {device = ""} : (tensor, tensor) -> tensor + %156 = "tf.Shape"(%140) {device = ""} : (tensor) -> tensor<1xi32> + %157 = "tf.StridedSlice"(%156, %15, %15, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %158 = "tf.StridedSlice"(%156, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %159 = "tf.StridedSlice"(%156, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %160 = "tf.StridedSlice"(%140, %5, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %161 = "tf.Range"(%10, %160, %13) {device = ""} : (tensor, tensor, tensor) -> tensor + %162 = "tf.StridedSlice"(%140, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %163 = "tf.StridedSlice"(%140, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %164 = "tf.Sub"(%162, %163) {device = ""} : (tensor, tensor) -> tensor + %165 = "tf.If"(%107, %107, %13, %103) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedGather_Assert_AssertGuard_false_8680, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedGather_Assert_AssertGuard_true_8670} : (tensor, tensor, tensor, tensor) -> tensor + %166 = "tf.Identity"(%165) {device = ""} : (tensor) -> tensor + %167 = "tf.Equal"(%103, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %168 = "tf.Select"(%167, %13, %103) {device = ""} : (tensor, tensor, tensor) -> tensor + %169 = "tf.Equal"(%168, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %170 = "tf.LogicalOr"(%169, %2) {device = ""} : (tensor, tensor) -> tensor + %171 = "tf.Equal"(%168, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %172 = "tf.LogicalOr"(%170, %171) {device = ""} : (tensor, tensor) -> tensor + %173 = "tf.Select"(%114, %168, %13) {device = ""} : (tensor, tensor, tensor) -> tensor + %174 = "tf.Pack"(%173, %13) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> + %175 = "tf.StridedSlice"(%174, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, 
end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %176 = "tf.Cast"(%175) {Truncate = false, device = ""} : (tensor) -> tensor + %177 = "tf.Reshape"(%176, %9) {device = ""} : (tensor, tensor<0xi32>) -> tensor + %178 = "tf.Pack"(%7, %177) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %179 = "tf.Tile"(%115, %178) {device = ""} : (tensor, tensor<2xi32>) -> tensor + %180 = "tf.Mul"(%177, %118) {device = ""} : (tensor, tensor) -> tensor + %181 = "tf.Pack"(%180) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %182 = "tf.ConcatV2"(%117, %181, %119, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %183 = "tf.Reshape"(%179, %182) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %184 = "tf.Shape"(%183) {device = ""} : (tensor) -> tensor<1xi64> + %185 = "tf.StridedSlice"(%184, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %186 = "tf.Pack"(%175) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %187 = "tf.StridedSlice"(%183, %186, %11, %12) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %188 = "tf.Sub"(%185, %175) {device = ""} : (tensor, tensor) -> tensor + %189 = "tf.Pack"(%188) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %190 = "tf.StridedSlice"(%183, %11, %189, %12) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %191:2 = "tf.RaggedRange"(%190, %187, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %192 = "tf.Select"(%2, %168, %13) {device = ""} : (tensor, tensor, tensor) -> tensor + %193 = "tf.Pack"(%192, %13) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> + %194 = "tf.StridedSlice"(%193, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %195 = "tf.Cast"(%194) {Truncate = false, device = ""} : (tensor) -> tensor + %196 = "tf.Reshape"(%195, %9) {device = ""} : (tensor, tensor<0xi32>) -> tensor + %197 = "tf.Pack"(%7, %196) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %198 = "tf.Tile"(%4, %197) {device = ""} : (tensor<2x1xi64>, tensor<2xi32>) -> tensor<2x?xi64> + %199 = "tf.Mul"(%196, %8) {device = ""} : (tensor, tensor) -> tensor + %200 = "tf.Pack"(%199) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %201 = "tf.ConcatV2"(%9, %200, %9, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %202 = "tf.Reshape"(%198, %201) {device = ""} : (tensor<2x?xi64>, tensor<1xi32>) -> tensor + %203 = "tf.Shape"(%202) {device = ""} : (tensor) -> tensor<1xi64> + %204 = "tf.StridedSlice"(%203, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %205 = 
"tf.Pack"(%194) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %206 = "tf.StridedSlice"(%202, %205, %11, %12) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %207 = "tf.Sub"(%204, %194) {device = ""} : (tensor, tensor) -> tensor + %208 = "tf.Pack"(%207) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %209 = "tf.StridedSlice"(%202, %11, %208, %12) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %210:2 = "tf.RaggedRange"(%209, %206, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %211 = "tf.StridedSlice"(%193, %15, %16, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> + %212 = "tf.StridedSlice"(%193, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %213 = "tf.Mul"(%212, %12) {device = ""} : (tensor, tensor<1xi64>) -> tensor<1xi64> + %214 = "tf.Tile"(%213, %211) {device = ""} : (tensor<1xi64>, tensor<1xi64>) -> tensor + %215 = "tf.Cumsum"(%214, %14) {device = "", exclusive = false, reverse = false} : (tensor, tensor) -> tensor + %216 = "tf.ConcatV2"(%11, %215, %3) {device = ""} : (tensor<1xi64>, tensor, tensor) -> tensor + %217 = "tf.StridedSlice"(%216, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %218 = "tf.ExpandDims"(%217, %7) {device = ""} : (tensor, tensor) -> tensor + %219 = "tf.Shape"(%217) {device = ""} : (tensor) -> tensor<1xi32> + %220 = "tf.StridedSlice"(%219, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %221 = "tf.Pack"(%220) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %222 = "tf.StridedSlice"(%216, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %223 = "tf.ExpandDims"(%222, %7) {device = ""} : (tensor, tensor) -> tensor + %224 = "tf.Shape"(%222) {device = ""} : (tensor) -> tensor<1xi32> + %225 = "tf.StridedSlice"(%224, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %226 = "tf.Pack"(%225) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %227 = "tf.Equal"(%103, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %228 = "tf.Select"(%227, %168, %13) {device = ""} : (tensor, tensor, tensor) -> tensor + %229 = "tf.Cast"(%228) {Truncate = false, device = ""} : (tensor) -> tensor + %230 = "tf.Reshape"(%229, %9) {device = ""} : (tensor, 
tensor<0xi32>) -> tensor + %231 = "tf.Pack"(%7, %230) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %232 = "tf.Mul"(%230, %8) {device = ""} : (tensor, tensor) -> tensor + %233 = "tf.Pack"(%232) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %234 = "tf.ConcatV2"(%9, %233, %9, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %235 = "tf.Pack"(%228) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %236 = "tf.Pack"(%10, %103) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> + %237 = "tf.ExpandDims"(%236, %7) {device = ""} : (tensor<2xi64>, tensor) -> tensor<2x1xi64> + %238 = "tf.Tile"(%237, %231) {device = ""} : (tensor<2x1xi64>, tensor<2xi32>) -> tensor<2x?xi64> + %239 = "tf.Reshape"(%238, %234) {device = ""} : (tensor<2x?xi64>, tensor<1xi32>) -> tensor + %240 = "tf.Shape"(%239) {device = ""} : (tensor) -> tensor<1xi64> + %241 = "tf.StridedSlice"(%240, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %242 = "tf.Sub"(%241, %228) {device = ""} : (tensor, tensor) -> tensor + %243 = "tf.Pack"(%242) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %244 = "tf.StridedSlice"(%239, %11, %243, %12) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %245 = "tf.StridedSlice"(%239, %235, %11, %12) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %246:2 = "tf.RaggedRange"(%244, %245, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %247 = "tf.GatherV2"(%110, %246#1, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %248 = "tf.Cast"(%247) {Truncate = false, device = ""} : (tensor) -> tensor + %249 = "tf.BroadcastTo"(%248, %221) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %250 = "tf.Max"(%249, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor + %251 = "tf.Maximum"(%14, %250) {device = ""} : (tensor, tensor) -> tensor + %252 = "tf.Range"(%14, %251, %7) {device = ""} : (tensor, tensor, tensor) -> tensor + %253 = "tf.Pack"(%7, %251) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %254 = "tf.Tile"(%218, %253) {device = ""} : (tensor, tensor<2xi32>) -> tensor + %255 = "tf.Shape"(%254) {device = ""} : (tensor) -> tensor<2xi32> + %256 = "tf.StridedSlice"(%255, %15, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> + %257 = "tf.Prod"(%256, %15) {device = "", keep_dims = false} : (tensor<2xi32>, tensor<1xi32>) -> tensor + %258 = "tf.Pack"(%257) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %259 = "tf.Shape"(%254) {device = ""} : (tensor) -> tensor<2xi32> + %260 = "tf.StridedSlice"(%259, %15, %15, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %261 = 
"tf.Shape"(%254) {device = ""} : (tensor) -> tensor<2xi32> + %262 = "tf.StridedSlice"(%261, %6, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %263 = "tf.ConcatV2"(%260, %258, %262, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %264 = "tf.Reshape"(%254, %263) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %265 = "tf.ExpandDims"(%249, %3) {device = ""} : (tensor, tensor) -> tensor + %266 = "tf.Less"(%252, %265) {device = ""} : (tensor, tensor) -> tensor + %267 = "tf.Reshape"(%266, %5) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %268 = "tf.Where"(%267) {device = ""} : (tensor) -> tensor + %269 = "tf.Squeeze"(%268) {device = "", squeeze_dims = [1]} : (tensor) -> tensor + %270 = "tf.GatherV2"(%264, %269, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %271 = "tf.Cast"(%247) {Truncate = false, device = ""} : (tensor) -> tensor + %272 = "tf.BroadcastTo"(%271, %226) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %273 = "tf.Max"(%272, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor + %274 = "tf.Maximum"(%14, %273) {device = ""} : (tensor, tensor) -> tensor + %275 = "tf.Range"(%14, %274, %7) {device = ""} : (tensor, tensor, tensor) -> tensor + %276 = "tf.Pack"(%7, %274) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %277 = "tf.Tile"(%223, %276) {device = ""} : (tensor, tensor<2xi32>) -> tensor + %278 = "tf.Shape"(%277) {device = ""} : (tensor) -> tensor<2xi32> + %279 = "tf.StridedSlice"(%278, %15, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> + %280 = "tf.Prod"(%279, %15) {device = "", keep_dims = false} : (tensor<2xi32>, tensor<1xi32>) -> tensor + %281 = "tf.Pack"(%280) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %282 = "tf.Shape"(%277) {device = ""} : (tensor) -> tensor<2xi32> + %283 = "tf.StridedSlice"(%282, %15, %15, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %284 = "tf.Shape"(%277) {device = ""} : (tensor) -> tensor<2xi32> + %285 = "tf.StridedSlice"(%284, %6, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %286 = "tf.ConcatV2"(%283, %281, %285, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %287 = "tf.Reshape"(%277, %286) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %288 = "tf.ExpandDims"(%272, %3) {device = ""} : (tensor, tensor) -> tensor + %289 = "tf.Less"(%275, %288) {device = ""} : (tensor, tensor) -> tensor + %290 = "tf.Reshape"(%289, %5) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %291 = "tf.Where"(%290) {device = ""} : (tensor) -> tensor + %292 = "tf.Squeeze"(%291) {device = "", squeeze_dims = [1]} : (tensor) -> tensor + %293 = "tf.GatherV2"(%287, %292, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %294:2 = "tf.RaggedRange"(%270, %293, 
%13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %295 = "tf.If"(%172, %172, %168, %13) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedGather_Assert_1_AssertGuard_false_9750, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedGather_Assert_1_AssertGuard_true_9740} : (tensor, tensor, tensor, tensor) -> tensor + %296 = "tf.Identity"(%295) {device = ""} : (tensor) -> tensor + %297 = "tf.Select"(%2, %168, %13) {device = ""} : (tensor, tensor, tensor) -> tensor + %298 = "tf.Pack"(%297) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %299 = "tf.ConcatV2"(%1, %298, %12, %14) {device = ""} : (tensor<0xi64>, tensor<1xi64>, tensor<1xi64>, tensor) -> tensor<2xi64> + %300 = "tf.StridedSlice"(%299, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %301 = "tf.Equal"(%300, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %302 = "tf.StridedSlice"(%299, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %303 = "tf.StridedSlice"(%299, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %304 = "tf.Equal"(%303, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %305 = "tf.If"(%304, %304, %303, %247) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedGather_Assert_2_AssertGuard_false_10240, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedGather_Assert_2_AssertGuard_true_10230} : (tensor, tensor, tensor, tensor) -> tensor + %306 = "tf.Identity"(%305) {device = ""} : (tensor) -> tensor + %307 = "tf.If"(%301, %301, %247, %302) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedGather_Assert_3_AssertGuard_false_10600, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedGather_Assert_3_AssertGuard_true_10590} : (tensor, tensor, tensor, tensor) -> tensor + %308 = "tf.If"(%147, %147, %13, %143) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_Assert_AssertGuard_false_15300, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_Assert_AssertGuard_true_15290} : (tensor, tensor, tensor, tensor) -> tensor + %309 = "tf.Identity"(%308) {device = ""} : (tensor) -> tensor + %310 = "tf.Equal"(%143, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %311 = "tf.Select"(%310, %13, %143) {device = ""} : (tensor, tensor, tensor) -> tensor + %312 = "tf.Equal"(%311, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %313 = "tf.LogicalOr"(%312, %2) {device = ""} : (tensor, tensor) -> tensor + %314 = "tf.Equal"(%311, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %315 = 
"tf.LogicalOr"(%313, %314) {device = ""} : (tensor, tensor) -> tensor + %316 = "tf.Select"(%154, %311, %13) {device = ""} : (tensor, tensor, tensor) -> tensor + %317 = "tf.Pack"(%316, %13) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> + %318 = "tf.StridedSlice"(%317, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %319 = "tf.Cast"(%318) {Truncate = false, device = ""} : (tensor) -> tensor + %320 = "tf.Reshape"(%319, %9) {device = ""} : (tensor, tensor<0xi32>) -> tensor + %321 = "tf.Pack"(%7, %320) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %322 = "tf.Tile"(%155, %321) {device = ""} : (tensor, tensor<2xi32>) -> tensor + %323 = "tf.Mul"(%320, %158) {device = ""} : (tensor, tensor) -> tensor + %324 = "tf.Pack"(%323) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %325 = "tf.ConcatV2"(%157, %324, %159, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %326 = "tf.Reshape"(%322, %325) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %327 = "tf.Shape"(%326) {device = ""} : (tensor) -> tensor<1xi64> + %328 = "tf.StridedSlice"(%327, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %329 = "tf.Pack"(%318) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %330 = "tf.StridedSlice"(%326, %329, %11, %12) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %331 = "tf.Sub"(%328, %318) {device = ""} : (tensor, tensor) -> tensor + %332 = "tf.Pack"(%331) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %333 = "tf.StridedSlice"(%326, %11, %332, %12) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %334:2 = "tf.RaggedRange"(%333, %330, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %335 = "tf.GatherV2"(%161, %334#1, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %336 = "tf.StridedSlice"(%317, %15, %16, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> + %337 = "tf.StridedSlice"(%317, %15, %16, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> + %338 = "tf.StridedSlice"(%317, %6, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi64> + %339 = "tf.ConcatV2"(%337, %338, %14) {device = ""} : (tensor<1xi64>, tensor<0xi64>, tensor) -> tensor<1xi64> + %340 = "tf.StridedSlice"(%317, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, 
new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %341 = "tf.Mul"(%164, %340) {device = ""} : (tensor, tensor) -> tensor + %342 = "tf.Tile"(%341, %336) {device = ""} : (tensor, tensor<1xi64>) -> tensor + %343 = "tf.Cumsum"(%342, %14) {device = "", exclusive = false, reverse = false} : (tensor, tensor) -> tensor + %344 = "tf.ConcatV2"(%11, %343, %3) {device = ""} : (tensor<1xi64>, tensor, tensor) -> tensor + %345 = "tf.Shape"(%344) {device = ""} : (tensor) -> tensor<1xi64> + %346 = "tf.StridedSlice"(%345, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %347 = "tf.Sub"(%346, %13) {device = ""} : (tensor, tensor) -> tensor + %348 = "tf.Equal"(%347, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %349 = "tf.LogicalOr"(%348, %2) {device = ""} : (tensor, tensor) -> tensor + %350 = "tf.Equal"(%347, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %351 = "tf.LogicalOr"(%349, %350) {device = ""} : (tensor, tensor) -> tensor + %352 = "tf.StridedSlice"(%344, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %353 = "tf.StridedSlice"(%344, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %354 = "tf.Sub"(%352, %353) {device = ""} : (tensor, tensor) -> tensor + %355 = "tf.Shape"(%344) {device = ""} : (tensor) -> tensor<1xi64> + %356 = "tf.StridedSlice"(%355, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %357 = "tf.Sub"(%356, %13) {device = ""} : (tensor, tensor) -> tensor + %358 = "tf.Equal"(%357, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %359 = "tf.ExpandDims"(%344, %7) {device = ""} : (tensor, tensor) -> tensor + %360 = "tf.Shape"(%344) {device = ""} : (tensor) -> tensor<1xi32> + %361 = "tf.StridedSlice"(%360, %15, %15, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %362 = "tf.StridedSlice"(%360, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %363 = "tf.StridedSlice"(%360, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %364 = "tf.Select"(%2, %311, %13) {device = ""} : (tensor, tensor, tensor) -> tensor + %365 = "tf.Pack"(%364, %13) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> + %366 = "tf.StridedSlice"(%365, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : 
i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %367 = "tf.Cast"(%366) {Truncate = false, device = ""} : (tensor) -> tensor + %368 = "tf.Reshape"(%367, %9) {device = ""} : (tensor, tensor<0xi32>) -> tensor + %369 = "tf.Pack"(%7, %368) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %370 = "tf.Tile"(%4, %369) {device = ""} : (tensor<2x1xi64>, tensor<2xi32>) -> tensor<2x?xi64> + %371 = "tf.Mul"(%368, %8) {device = ""} : (tensor, tensor) -> tensor + %372 = "tf.Pack"(%371) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %373 = "tf.ConcatV2"(%9, %372, %9, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %374 = "tf.Reshape"(%370, %373) {device = ""} : (tensor<2x?xi64>, tensor<1xi32>) -> tensor + %375 = "tf.Shape"(%374) {device = ""} : (tensor) -> tensor<1xi64> + %376 = "tf.StridedSlice"(%375, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %377 = "tf.Pack"(%366) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %378 = "tf.StridedSlice"(%374, %377, %11, %12) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %379 = "tf.Sub"(%376, %366) {device = ""} : (tensor, tensor) -> tensor + %380 = "tf.Pack"(%379) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %381 = "tf.StridedSlice"(%374, %11, %380, %12) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %382:2 = "tf.RaggedRange"(%381, %378, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %383 = "tf.GatherV2"(%11, %382#1, %14) {batch_dims = 0 : i64, device = ""} : (tensor<1xi64>, tensor, tensor) -> tensor + %384 = "tf.GatherV2"(%12, %383, %14) {batch_dims = 0 : i64, device = ""} : (tensor<1xi64>, tensor, tensor) -> tensor + %385 = "tf.StridedSlice"(%365, %15, %16, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> + %386 = "tf.StridedSlice"(%365, %15, %16, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> + %387 = "tf.StridedSlice"(%365, %6, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi64> + %388 = "tf.ConcatV2"(%386, %387, %14) {device = ""} : (tensor<1xi64>, tensor<0xi64>, tensor) -> tensor<1xi64> + %389 = "tf.Tile"(%384, %388) {device = ""} : (tensor, tensor<1xi64>) -> tensor + %390 = "tf.StridedSlice"(%365, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %391 = "tf.Mul"(%390, %12) 
{device = ""} : (tensor, tensor<1xi64>) -> tensor<1xi64> + %392 = "tf.Tile"(%391, %385) {device = ""} : (tensor<1xi64>, tensor<1xi64>) -> tensor + %393 = "tf.Cumsum"(%392, %14) {device = "", exclusive = false, reverse = false} : (tensor, tensor) -> tensor + %394 = "tf.ConcatV2"(%11, %393, %3) {device = ""} : (tensor<1xi64>, tensor, tensor) -> tensor + %395 = "tf.StridedSlice"(%394, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %396 = "tf.ExpandDims"(%395, %7) {device = ""} : (tensor, tensor) -> tensor + %397 = "tf.Shape"(%395) {device = ""} : (tensor) -> tensor<1xi32> + %398 = "tf.StridedSlice"(%397, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %399 = "tf.Pack"(%398) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %400 = "tf.StridedSlice"(%394, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %401 = "tf.ExpandDims"(%400, %7) {device = ""} : (tensor, tensor) -> tensor + %402 = "tf.Shape"(%400) {device = ""} : (tensor) -> tensor<1xi32> + %403 = "tf.StridedSlice"(%402, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %404 = "tf.Pack"(%403) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %405 = "tf.Equal"(%143, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %406 = "tf.Select"(%405, %311, %13) {device = ""} : (tensor, tensor, tensor) -> tensor + %407 = "tf.Cast"(%406) {Truncate = false, device = ""} : (tensor) -> tensor + %408 = "tf.Reshape"(%407, %9) {device = ""} : (tensor, tensor<0xi32>) -> tensor + %409 = "tf.Pack"(%7, %408) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %410 = "tf.Mul"(%408, %8) {device = ""} : (tensor, tensor) -> tensor + %411 = "tf.Pack"(%410) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %412 = "tf.ConcatV2"(%9, %411, %9, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %413 = "tf.Pack"(%406) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %414 = "tf.Pack"(%10, %143) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> + %415 = "tf.ExpandDims"(%414, %7) {device = ""} : (tensor<2xi64>, tensor) -> tensor<2x1xi64> + %416 = "tf.Tile"(%415, %409) {device = ""} : (tensor<2x1xi64>, tensor<2xi32>) -> tensor<2x?xi64> + %417 = "tf.Reshape"(%416, %412) {device = ""} : (tensor<2x?xi64>, tensor<1xi32>) -> tensor + %418 = "tf.Shape"(%417) {device = ""} : (tensor) -> tensor<1xi64> + %419 = "tf.StridedSlice"(%418, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %420 = "tf.Sub"(%419, %406) {device = ""} : (tensor, tensor) -> tensor + %421 = "tf.Pack"(%420) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %422 = "tf.StridedSlice"(%417, %11, %421, %12) {begin_mask 
= 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %423 = "tf.StridedSlice"(%417, %413, %11, %12) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %424:2 = "tf.RaggedRange"(%422, %423, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %425 = "tf.GatherV2"(%150, %424#1, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %426 = "tf.Cast"(%425) {Truncate = false, device = ""} : (tensor) -> tensor + %427 = "tf.BroadcastTo"(%426, %399) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %428 = "tf.Max"(%427, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor + %429 = "tf.Maximum"(%14, %428) {device = ""} : (tensor, tensor) -> tensor + %430 = "tf.Range"(%14, %429, %7) {device = ""} : (tensor, tensor, tensor) -> tensor + %431 = "tf.Pack"(%7, %429) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %432 = "tf.Tile"(%396, %431) {device = ""} : (tensor, tensor<2xi32>) -> tensor + %433 = "tf.Shape"(%432) {device = ""} : (tensor) -> tensor<2xi32> + %434 = "tf.StridedSlice"(%433, %15, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> + %435 = "tf.Prod"(%434, %15) {device = "", keep_dims = false} : (tensor<2xi32>, tensor<1xi32>) -> tensor + %436 = "tf.Pack"(%435) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %437 = "tf.Shape"(%432) {device = ""} : (tensor) -> tensor<2xi32> + %438 = "tf.StridedSlice"(%437, %15, %15, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %439 = "tf.Shape"(%432) {device = ""} : (tensor) -> tensor<2xi32> + %440 = "tf.StridedSlice"(%439, %6, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %441 = "tf.ConcatV2"(%438, %436, %440, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %442 = "tf.Reshape"(%432, %441) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %443 = "tf.ExpandDims"(%427, %3) {device = ""} : (tensor, tensor) -> tensor + %444 = "tf.Less"(%430, %443) {device = ""} : (tensor, tensor) -> tensor + %445 = "tf.Reshape"(%444, %5) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %446 = "tf.Where"(%445) {device = ""} : (tensor) -> tensor + %447 = "tf.Squeeze"(%446) {device = "", squeeze_dims = [1]} : (tensor) -> tensor + %448 = "tf.GatherV2"(%442, %447, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %449 = "tf.Cast"(%425) {Truncate = false, device = ""} : (tensor) -> tensor + %450 = "tf.BroadcastTo"(%449, %404) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %451 = "tf.Max"(%450, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor + %452 = "tf.Maximum"(%14, %451) {device = ""} : (tensor, tensor) -> tensor + %453 = "tf.Range"(%14, %452, %7) {device = 
""} : (tensor, tensor, tensor) -> tensor + %454 = "tf.Pack"(%7, %452) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %455 = "tf.Tile"(%401, %454) {device = ""} : (tensor, tensor<2xi32>) -> tensor + %456 = "tf.Shape"(%455) {device = ""} : (tensor) -> tensor<2xi32> + %457 = "tf.StridedSlice"(%456, %15, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> + %458 = "tf.Prod"(%457, %15) {device = "", keep_dims = false} : (tensor<2xi32>, tensor<1xi32>) -> tensor + %459 = "tf.Pack"(%458) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %460 = "tf.Shape"(%455) {device = ""} : (tensor) -> tensor<2xi32> + %461 = "tf.StridedSlice"(%460, %15, %15, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %462 = "tf.Shape"(%455) {device = ""} : (tensor) -> tensor<2xi32> + %463 = "tf.StridedSlice"(%462, %6, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %464 = "tf.ConcatV2"(%461, %459, %463, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %465 = "tf.Reshape"(%455, %464) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %466 = "tf.ExpandDims"(%450, %3) {device = ""} : (tensor, tensor) -> tensor + %467 = "tf.Less"(%453, %466) {device = ""} : (tensor, tensor) -> tensor + %468 = "tf.Reshape"(%467, %5) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %469 = "tf.Where"(%468) {device = ""} : (tensor) -> tensor + %470 = "tf.Squeeze"(%469) {device = "", squeeze_dims = [1]} : (tensor) -> tensor + %471 = "tf.GatherV2"(%465, %470, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %472:2 = "tf.RaggedRange"(%448, %471, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %473 = "tf.GatherV2"(%389, %472#1, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %474 = "tf.If"(%315, %315, %311, %13) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_Assert_1_AssertGuard_false_16370, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_Assert_1_AssertGuard_true_16360} : (tensor, tensor, tensor, tensor) -> tensor + %475 = "tf.Identity"(%474) {device = ""} : (tensor) -> tensor + %476 = "tf.Select"(%2, %311, %13) {device = ""} : (tensor, tensor, tensor) -> tensor + %477 = "tf.Pack"(%476) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %478 = "tf.ConcatV2"(%1, %477, %12, %14) {device = ""} : (tensor<0xi64>, tensor<1xi64>, tensor<1xi64>, tensor) -> tensor<2xi64> + %479 = "tf.StridedSlice"(%478, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %480 = "tf.Equal"(%479, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %481 = "tf.StridedSlice"(%478, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, 
new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %482 = "tf.StridedSlice"(%478, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %483 = "tf.Equal"(%482, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %484 = "tf.If"(%483, %483, %482, %425) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_Assert_2_AssertGuard_false_16860, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_Assert_2_AssertGuard_true_16850} : (tensor, tensor, tensor, tensor) -> tensor + %485 = "tf.Identity"(%484) {device = ""} : (tensor) -> tensor + %486 = "tf.If"(%480, %480, %425, %481) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_Assert_3_AssertGuard_false_17220, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_Assert_3_AssertGuard_true_17210} : (tensor, tensor, tensor, tensor) -> tensor + %487 = "tf.Identity"(%486) {device = ""} : (tensor) -> tensor + %488 = "tf.If"(%351, %351, %13, %347) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedGather_1_Assert_AssertGuard_false_21900, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedGather_1_Assert_AssertGuard_true_21890} : (tensor, tensor, tensor, tensor) -> tensor + %489 = "tf.Identity"(%488) {device = ""} : (tensor) -> tensor + %490 = "tf.Equal"(%347, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %491 = "tf.Select"(%490, %13, %347) {device = ""} : (tensor, tensor, tensor) -> tensor + %492 = "tf.Equal"(%491, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %493 = "tf.LogicalOr"(%492, %2) {device = ""} : (tensor, tensor) -> tensor + %494 = "tf.Equal"(%491, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %495 = "tf.LogicalOr"(%493, %494) {device = ""} : (tensor, tensor) -> tensor + %496 = "tf.Select"(%358, %491, %13) {device = ""} : (tensor, tensor, tensor) -> tensor + %497 = "tf.Pack"(%496, %13) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> + %498 = "tf.StridedSlice"(%497, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %499 = "tf.Cast"(%498) {Truncate = false, device = ""} : (tensor) -> tensor + %500 = "tf.Reshape"(%499, %9) {device = ""} : (tensor, tensor<0xi32>) -> tensor + %501 = "tf.Pack"(%7, %500) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %502 = "tf.Tile"(%359, %501) {device = ""} : (tensor, tensor<2xi32>) -> tensor + %503 = "tf.Mul"(%500, %362) {device = ""} : (tensor, tensor) -> tensor + %504 = "tf.Pack"(%503) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %505 = "tf.ConcatV2"(%361, %504, %363, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %506 = "tf.Reshape"(%502, %505) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %507 = "tf.Shape"(%506) {device = ""} : (tensor) -> tensor<1xi64> + 
%508 = "tf.StridedSlice"(%507, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %509 = "tf.Pack"(%498) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %510 = "tf.StridedSlice"(%506, %509, %11, %12) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %511 = "tf.Sub"(%508, %498) {device = ""} : (tensor, tensor) -> tensor + %512 = "tf.Pack"(%511) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %513 = "tf.StridedSlice"(%506, %11, %512, %12) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %514:2 = "tf.RaggedRange"(%513, %510, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %515 = "tf.Select"(%2, %491, %13) {device = ""} : (tensor, tensor, tensor) -> tensor + %516 = "tf.Pack"(%515, %13) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> + %517 = "tf.StridedSlice"(%516, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %518 = "tf.Cast"(%517) {Truncate = false, device = ""} : (tensor) -> tensor + %519 = "tf.Reshape"(%518, %9) {device = ""} : (tensor, tensor<0xi32>) -> tensor + %520 = "tf.Pack"(%7, %519) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %521 = "tf.Tile"(%4, %520) {device = ""} : (tensor<2x1xi64>, tensor<2xi32>) -> tensor<2x?xi64> + %522 = "tf.Mul"(%519, %8) {device = ""} : (tensor, tensor) -> tensor + %523 = "tf.Pack"(%522) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %524 = "tf.ConcatV2"(%9, %523, %9, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %525 = "tf.Reshape"(%521, %524) {device = ""} : (tensor<2x?xi64>, tensor<1xi32>) -> tensor + %526 = "tf.Shape"(%525) {device = ""} : (tensor) -> tensor<1xi64> + %527 = "tf.StridedSlice"(%526, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %528 = "tf.Pack"(%517) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %529 = "tf.StridedSlice"(%525, %528, %11, %12) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %530 = "tf.Sub"(%527, %517) {device = ""} : (tensor, tensor) -> tensor + %531 = "tf.Pack"(%530) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %532 = "tf.StridedSlice"(%525, %11, %531, %12) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %533:2 = "tf.RaggedRange"(%532, %529, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %534 = "tf.StridedSlice"(%516, %15, %16, %16) {begin_mask = 1 : i64, device = 
"", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> + %535 = "tf.StridedSlice"(%516, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %536 = "tf.Mul"(%535, %12) {device = ""} : (tensor, tensor<1xi64>) -> tensor<1xi64> + %537 = "tf.Tile"(%536, %534) {device = ""} : (tensor<1xi64>, tensor<1xi64>) -> tensor + %538 = "tf.Cumsum"(%537, %14) {device = "", exclusive = false, reverse = false} : (tensor, tensor) -> tensor + %539 = "tf.ConcatV2"(%11, %538, %3) {device = ""} : (tensor<1xi64>, tensor, tensor) -> tensor + %540 = "tf.StridedSlice"(%539, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %541 = "tf.ExpandDims"(%540, %7) {device = ""} : (tensor, tensor) -> tensor + %542 = "tf.Shape"(%540) {device = ""} : (tensor) -> tensor<1xi32> + %543 = "tf.StridedSlice"(%542, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %544 = "tf.Pack"(%543) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %545 = "tf.StridedSlice"(%539, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %546 = "tf.ExpandDims"(%545, %7) {device = ""} : (tensor, tensor) -> tensor + %547 = "tf.Shape"(%545) {device = ""} : (tensor) -> tensor<1xi32> + %548 = "tf.StridedSlice"(%547, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %549 = "tf.Pack"(%548) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %550 = "tf.Equal"(%347, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %551 = "tf.Select"(%550, %491, %13) {device = ""} : (tensor, tensor, tensor) -> tensor + %552 = "tf.Cast"(%551) {Truncate = false, device = ""} : (tensor) -> tensor + %553 = "tf.Reshape"(%552, %9) {device = ""} : (tensor, tensor<0xi32>) -> tensor + %554 = "tf.Pack"(%7, %553) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %555 = "tf.Mul"(%553, %8) {device = ""} : (tensor, tensor) -> tensor + %556 = "tf.Pack"(%555) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %557 = "tf.ConcatV2"(%9, %556, %9, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %558 = "tf.Pack"(%551) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %559 = "tf.Pack"(%10, %347) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> + %560 = "tf.ExpandDims"(%559, %7) {device = ""} : (tensor<2xi64>, tensor) -> tensor<2x1xi64> + %561 = "tf.Tile"(%560, %554) {device = ""} : (tensor<2x1xi64>, tensor<2xi32>) -> tensor<2x?xi64> + %562 = "tf.Reshape"(%561, %557) {device = ""} : (tensor<2x?xi64>, tensor<1xi32>) -> tensor + %563 = "tf.Shape"(%562) {device = ""} : (tensor) -> tensor<1xi64> + %564 
= "tf.StridedSlice"(%563, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %565 = "tf.Sub"(%564, %551) {device = ""} : (tensor, tensor) -> tensor + %566 = "tf.Pack"(%565) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %567 = "tf.StridedSlice"(%562, %11, %566, %12) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %568 = "tf.StridedSlice"(%562, %558, %11, %12) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %569:2 = "tf.RaggedRange"(%567, %568, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %570 = "tf.GatherV2"(%354, %569#1, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %571 = "tf.Cast"(%570) {Truncate = false, device = ""} : (tensor) -> tensor + %572 = "tf.BroadcastTo"(%571, %544) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %573 = "tf.Max"(%572, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor + %574 = "tf.Maximum"(%14, %573) {device = ""} : (tensor, tensor) -> tensor + %575 = "tf.Range"(%14, %574, %7) {device = ""} : (tensor, tensor, tensor) -> tensor + %576 = "tf.Pack"(%7, %574) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %577 = "tf.Tile"(%541, %576) {device = ""} : (tensor, tensor<2xi32>) -> tensor + %578 = "tf.Shape"(%577) {device = ""} : (tensor) -> tensor<2xi32> + %579 = "tf.StridedSlice"(%578, %15, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> + %580 = "tf.Prod"(%579, %15) {device = "", keep_dims = false} : (tensor<2xi32>, tensor<1xi32>) -> tensor + %581 = "tf.Pack"(%580) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %582 = "tf.Shape"(%577) {device = ""} : (tensor) -> tensor<2xi32> + %583 = "tf.StridedSlice"(%582, %15, %15, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %584 = "tf.Shape"(%577) {device = ""} : (tensor) -> tensor<2xi32> + %585 = "tf.StridedSlice"(%584, %6, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %586 = "tf.ConcatV2"(%583, %581, %585, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %587 = "tf.Reshape"(%577, %586) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %588 = "tf.ExpandDims"(%572, %3) {device = ""} : (tensor, tensor) -> tensor + %589 = "tf.Less"(%575, %588) {device = ""} : (tensor, tensor) -> tensor + %590 = "tf.Reshape"(%589, %5) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %591 = "tf.Where"(%590) {device = ""} : (tensor) -> tensor + %592 = "tf.Squeeze"(%591) {device = "", squeeze_dims = [1]} : (tensor) -> tensor + %593 = "tf.GatherV2"(%587, 
%592, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %594 = "tf.Cast"(%570) {Truncate = false, device = ""} : (tensor) -> tensor + %595 = "tf.BroadcastTo"(%594, %549) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %596 = "tf.Max"(%595, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor + %597 = "tf.Maximum"(%14, %596) {device = ""} : (tensor, tensor) -> tensor + %598 = "tf.Range"(%14, %597, %7) {device = ""} : (tensor, tensor, tensor) -> tensor + %599 = "tf.Pack"(%7, %597) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %600 = "tf.Tile"(%546, %599) {device = ""} : (tensor, tensor<2xi32>) -> tensor + %601 = "tf.Shape"(%600) {device = ""} : (tensor) -> tensor<2xi32> + %602 = "tf.StridedSlice"(%601, %15, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> + %603 = "tf.Prod"(%602, %15) {device = "", keep_dims = false} : (tensor<2xi32>, tensor<1xi32>) -> tensor + %604 = "tf.Pack"(%603) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %605 = "tf.Shape"(%600) {device = ""} : (tensor) -> tensor<2xi32> + %606 = "tf.StridedSlice"(%605, %15, %15, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %607 = "tf.Shape"(%600) {device = ""} : (tensor) -> tensor<2xi32> + %608 = "tf.StridedSlice"(%607, %6, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %609 = "tf.ConcatV2"(%606, %604, %608, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %610 = "tf.Reshape"(%600, %609) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %611 = "tf.ExpandDims"(%595, %3) {device = ""} : (tensor, tensor) -> tensor + %612 = "tf.Less"(%598, %611) {device = ""} : (tensor, tensor) -> tensor + %613 = "tf.Reshape"(%612, %5) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %614 = "tf.Where"(%613) {device = ""} : (tensor) -> tensor + %615 = "tf.Squeeze"(%614) {device = "", squeeze_dims = [1]} : (tensor) -> tensor + %616 = "tf.GatherV2"(%610, %615, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %617:2 = "tf.RaggedRange"(%593, %616, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %618 = "tf.If"(%495, %495, %491, %13) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedGather_1_Assert_1_AssertGuard_false_22970, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedGather_1_Assert_1_AssertGuard_true_22960} : (tensor, tensor, tensor, tensor) -> tensor + %619 = "tf.Identity"(%618) {device = ""} : (tensor) -> tensor + %620 = "tf.Select"(%2, %491, %13) {device = ""} : (tensor, tensor, tensor) -> tensor + %621 = "tf.Pack"(%620) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %622 = "tf.ConcatV2"(%1, %621, %12, %14) {device = ""} : (tensor<0xi64>, tensor<1xi64>, tensor<1xi64>, tensor) -> tensor<2xi64> + %623 = "tf.StridedSlice"(%622, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : 
i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %624 = "tf.Equal"(%623, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %625 = "tf.StridedSlice"(%622, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %626 = "tf.StridedSlice"(%622, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %627 = "tf.Equal"(%626, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %628 = "tf.If"(%627, %627, %626, %570) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedGather_1_Assert_2_AssertGuard_false_23460, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedGather_1_Assert_2_AssertGuard_true_23450} : (tensor, tensor, tensor, tensor) -> tensor + %629 = "tf.Identity"(%628) {device = ""} : (tensor) -> tensor + %630 = "tf.If"(%624, %624, %570, %625) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_RaggedGather_1_Assert_3_AssertGuard_false_23820, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_RaggedGather_1_Assert_3_AssertGuard_true_23810} : (tensor, tensor, tensor, tensor) -> tensor + %631 = "tf.Identity"(%79) {device = ""} : (tensor) -> tensor + %632 = "tf.Identity"(%630) {device = ""} : (tensor) -> tensor + %633 = "tf.Identity"(%307) {device = ""} : (tensor) -> tensor + %634 = "tf.Shape"(%36#2) {device = ""} : (tensor) -> tensor<1xi32> + %635 = "tf.StridedSlice"(%634, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %636 = "tf.Cast"(%635) {Truncate = false, device = ""} : (tensor<0xi32>) -> tensor<0xi64> + %637 = "tf.Identity"(%636) {device = ""} : (tensor<0xi64>) -> tensor<0xi64> + %638 = "tf.Shape"(%36#3) {device = ""} : (tensor) -> tensor<1xi32> + %639 = "tf.StridedSlice"(%638, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %640 = "tf.Cast"(%639) {Truncate = false, device = ""} : (tensor<0xi32>) -> tensor<0xi64> + %641 = "tf.Identity"(%640) {device = ""} : (tensor<0xi64>) -> tensor<0xi64> + %642 = "tf.GatherV2"(%36#3, %335, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %643 = "tf.Tile"(%642, %339) {device = ""} : (tensor, tensor<1xi64>) -> tensor + %644 = "tf.Sub"(%643, %473) {device = ""} : (tensor, tensor) -> tensor + %645 = "tf.Shape"(%644) {device = ""} : (tensor) -> tensor<1xi32> + %646 = "tf.StridedSlice"(%645, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %647 = "tf.Cast"(%646) {Truncate = false, device = ""} 
: (tensor<0xi32>) -> tensor<0xi64> + %648 = "tf.Identity"(%647) {device = ""} : (tensor<0xi64>) -> tensor<0xi64> + %649 = "tf.UnicodeEncode"(%36#0, %57) {Tsplits = i64, device = "", errors = "replace", output_encoding = "UTF-8", replacement_char = 65533 : i64} : (tensor, tensor) -> tensor + %650 = "tf.Identity"(%649) {device = ""} : (tensor) -> tensor + return %650, %631 : tensor, tensor } +func @WhitespaceTokenize_RaggedConcat_assert_equal_1_Assert_AssertGuard_false_3210(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Input tensors have incompatible shapes."> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/RaggedConcat/RaggedFromTensor/Const:0) = "> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/RaggedConcat/RaggedNRows/Const:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_RaggedConcat_assert_equal_1_Assert_AssertGuard_true_3200(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_3970(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:zero"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits/RowPartitionFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits/RowPartitionFromRowSplits/Const:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_3960(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_4330(%arg0: tensor, %arg1: tensor) -> tensor 
attributes {tf._input_shapes = [#tf.shape<>, #tf.shape], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:monotonic"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x >= 0 did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits/RowPartitionFromRowSplits/sub:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor) -> () + %3 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %4 = "tf.Identity"(%3) {device = ""} : (tensor) -> tensor + return %4 : tensor +} +func @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_4320(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_4670(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to _from_row_partition do not form a valid RaggedTensor"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits/strided_slice_1:0) = "> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_4660(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_5040(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:zero"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits_1/RowPartitionFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits_1/RowPartitionFromRowSplits/Const:0) = "> : tensor} : () 
-> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_5030(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_5400(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:monotonic"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x >= 0 did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits_1/RowPartitionFromRowSplits/sub:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor) -> () + %3 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %4 = "tf.Identity"(%3) {device = ""} : (tensor) -> tensor + return %4 : tensor +} +func @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_5390(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_assert_equal_1_Assert_AssertGuard_false_5760(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to _from_row_partition do not form a valid RaggedTensor"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits_1/strided_slice:0) = "> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits_1/RaggedNRows/sub:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_assert_equal_1_Assert_AssertGuard_true_5750(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = 
"tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_6110(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:zero"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/RaggedFromNestedRowSplits_1/RaggedFromRowSplits/RowPartitionFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/RaggedFromNestedRowSplits_1/RaggedFromRowSplits/RowPartitionFromRowSplits/Const:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_6100(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_6470(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:monotonic"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x >= 0 did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/RaggedFromNestedRowSplits_1/RaggedFromRowSplits/RowPartitionFromRowSplits/sub:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor) -> () + %3 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %4 = "tf.Identity"(%3) {device = ""} : (tensor) -> tensor + return %4 : tensor +} +func @WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_6460(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_6810(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to _from_row_partition do not form a valid RaggedTensor"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> 
: tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/RaggedFromNestedRowSplits_1/RaggedFromRowSplits/strided_slice_1:0) = "> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/RaggedFromNestedRowSplits_1/RaggedFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_6800(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_7180(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:zero"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/RaggedFromNestedRowSplits_2/RaggedFromRowSplits/RowPartitionFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/RaggedFromNestedRowSplits_2/RaggedFromRowSplits/RowPartitionFromRowSplits/Const:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_7170(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_7540(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:monotonic"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x >= 0 did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/RaggedFromNestedRowSplits_2/RaggedFromRowSplits/RowPartitionFromRowSplits/sub:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor) -> () + %3 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %4 = "tf.Identity"(%3) {device 
= ""} : (tensor) -> tensor + return %4 : tensor +} +func @WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_7530(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_7880(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to _from_row_partition do not form a valid RaggedTensor"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/RaggedFromNestedRowSplits_2/RaggedFromRowSplits/strided_slice_1:0) = "> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/RaggedFromNestedRowSplits_2/RaggedFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_7870(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_RaggedGather_Assert_AssertGuard_false_8680(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_RaggedGather_Assert_AssertGuard_true_8670(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_RaggedGather_Assert_1_AssertGuard_false_9750(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<0> 
: tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_RaggedGather_Assert_1_AssertGuard_true_9740(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_RaggedGather_Assert_2_AssertGuard_false_10240(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_RaggedGather_Assert_2_AssertGuard_true_10230(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_RaggedGather_Assert_3_AssertGuard_false_10600(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_RaggedGather_Assert_3_AssertGuard_true_10590(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_Assert_AssertGuard_false_15300(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor + %1 = 
"tf.Const"() {value = dense<0> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_Assert_AssertGuard_true_15290(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_Assert_1_AssertGuard_false_16370(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_Assert_1_AssertGuard_true_16360(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_Assert_2_AssertGuard_false_16860(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_Assert_2_AssertGuard_true_16850(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_Assert_3_AssertGuard_false_17220(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<1> : 
tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_Assert_3_AssertGuard_true_17210(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_RaggedGather_1_Assert_AssertGuard_false_21900(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_RaggedGather_1_Assert_AssertGuard_true_21890(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_RaggedGather_1_Assert_1_AssertGuard_false_22970(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_RaggedGather_1_Assert_1_AssertGuard_true_22960(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_RaggedGather_1_Assert_2_AssertGuard_false_23460(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor 
+ %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_RaggedGather_1_Assert_2_AssertGuard_true_23450(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_RaggedGather_1_Assert_3_AssertGuard_false_23820(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_RaggedGather_1_Assert_3_AssertGuard_true_23810(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} + +// CHECK: func @whitespace_tokenizer_rank1(%arg0: tensor<1x!tf.string> {tf._user_specified_name = "input"}) -> (tensor, tensor) attributes {sym_visibility = "private", tf._implements = #tf.func<@"tftext:WhitespaceTokenizer", {}>, tf._input_shapes = [#tf.shape<1>], tf.signature.is_stateful} { +// CHECK: %0:2 = "tfl.custom"(%arg0) {custom_code = "tftext:WhitespaceTokenizer", custom_option = opaque<"tfl", "0x"> : tensor<0xi8>} : (tensor<1x!tf.string>) -> (tensor, tensor) +// CHECK: return %0#0, %0#1 : tensor, tensor + +func @whitespace_tokenizer_rank2(%arg0: tensor {tf._user_specified_name = "input"}) -> (tensor, tensor, tensor) attributes {sym_visibility = "private", tf._input_shapes = [#tf.shape], tf._implements = #tf.func<@"tftext:WhitespaceTokenizer", {}>, tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<[]> : tensor<0xi64>} : () -> tensor<0xi64> + %1 = "tf.Const"() {value = dense : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<[[0], [1]]> : tensor<2x1xi64>} : () -> tensor<2x1xi64> + %4 = "tf.Const"() {value = dense<[2, -1]> : tensor<2xi32>} : () -> tensor<2xi32> + %5 = "tf.Const"() {value = dense<2> : tensor} : () -> tensor + %6 = "tf.Const"() {value = dense<-1> : tensor<1xi32>} : () -> tensor<1xi32> + %7 = "tf.Const"() {value = dense<2> : tensor<1xi32>} : () -> tensor<1xi32> + %8 = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32> + %9 = "tf.Const"() 
{value = dense<1> : tensor} : () -> tensor + %10 = "tf.Const"() {value = dense<2> : tensor} : () -> tensor + %11 = "tf.Const"() {value = dense<[]> : tensor<0xi32>} : () -> tensor<0xi32> + %12 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %13 = "tf.Const"() {value = dense<0> : tensor<1xi64>} : () -> tensor<1xi64> + %14 = "tf.Const"() {value = dense<1> : tensor<1xi64>} : () -> tensor<1xi64> + %15 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %16 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %17 = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> + %18 = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + %19 = "tf.Shape"(%arg0) {device = ""} : (tensor) -> tensor<2xi64> + %20 = "tf.StridedSlice"(%19, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %21 = "tf.StridedSlice"(%19, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %22 = "tf.Mul"(%20, %21) {device = ""} : (tensor, tensor) -> tensor + %23 = "tf.Pack"(%22) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %24 = "tf.StridedSlice"(%19, %7, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi64> + %25 = "tf.ConcatV2"(%23, %24, %16) {device = ""} : (tensor<1xi64>, tensor<0xi64>, tensor) -> tensor<1xi64> + %26 = "tf.Reshape"(%arg0, %25) {device = ""} : (tensor, tensor<1xi64>) -> tensor + %27 = "tf.StringLength"(%26) {device = "", unit = "BYTE"} : (tensor) -> tensor + %28 = "tf.ExpandDims"(%27, %9) {device = ""} : (tensor, tensor) -> tensor + %29 = "tf.Cast"(%28) {Truncate = false, device = ""} : (tensor) -> tensor + %30 = "tf.Shape"(%29) {device = ""} : (tensor) -> tensor<2xi64> + %31 = "tf.StridedSlice"(%30, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %32 = "tf.StridedSlice"(%30, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %33 = "tf.Mul"(%31, %32) {device = ""} : (tensor, tensor) -> tensor + %34 = "tf.Pack"(%33) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %35 = "tf.StridedSlice"(%30, %7, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi64> + %36 = "tf.ConcatV2"(%34, %35, %16) {device = ""} : (tensor<1xi64>, tensor<0xi64>, tensor) -> tensor<1xi64> + %37 = "tf.Reshape"(%29, %36) {device = ""} : (tensor, tensor<1xi64>) -> tensor + %38 = "tf.StridedSlice"(%30, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %39 = "tf.AddV2"(%38, %15) 
{device = ""} : (tensor, tensor) -> tensor + %40 = "tf.Range"(%12, %39, %15) {device = ""} : (tensor, tensor, tensor) -> tensor + %41 = "tf.Mul"(%40, %15) {device = ""} : (tensor, tensor) -> tensor + %42 = "tf.Reshape"(%26, %6) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %43:3 = "tf.UnicodeDecodeWithOffsets"(%42) {Tsplits = i64, device = "", errors = "replace", input_encoding = "UTF-8", replace_control_characters = false, replacement_char = 65533 : i64} : (tensor) -> (tensor, tensor, tensor) + %44 = "tf.StridedSlice"(%43#0, %17, %6, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %45 = "tf.Shape"(%44) {device = ""} : (tensor) -> tensor<1xi32> + %46 = "tf.ConcatV2"(%45, %18, %16) {device = ""} : (tensor<1xi32>, tensor<1xi32>, tensor) -> tensor<2xi32> + %47 = "tf.Reshape"(%44, %46) {device = ""} : (tensor, tensor<2xi32>) -> tensor + %48 = "tf.Shape"(%47) {device = ""} : (tensor) -> tensor<2xi64> + %49 = "tf.StridedSlice"(%48, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %50 = "tf.AddV2"(%49, %15) {device = ""} : (tensor, tensor) -> tensor + %51 = "tf.Range"(%12, %50, %15) {device = ""} : (tensor, tensor, tensor) -> tensor + %52 = "tf.Mul"(%51, %15) {device = ""} : (tensor, tensor) -> tensor + %53 = "tf.ExpandDims"(%52, %9) {device = ""} : (tensor, tensor) -> tensor + %54 = "tf.Shape"(%52) {device = ""} : (tensor) -> tensor<1xi32> + %55 = "tf.StridedSlice"(%54, %17, %17, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %56 = "tf.StridedSlice"(%54, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %57 = "tf.StridedSlice"(%54, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %58 = "tf.StridedSlice"(%52, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %59 = "tf.StridedSlice"(%52, %17, %6, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %60 = "tf.Sub"(%58, %59) {device = ""} : (tensor, tensor) -> tensor + %61 = "tf.Shape"(%47) {device = ""} : (tensor) -> tensor<2xi32> + %62 = "tf.Cast"(%61) {Truncate = false, device = ""} : (tensor<2xi32>) -> tensor<2xi64> + %63 = "tf.StridedSlice"(%62, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %64 = "tf.Equal"(%63, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %65 = 
"tf.StridedSlice"(%62, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %66 = "tf.Equal"(%65, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %67 = "tf.StridedSlice"(%62, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %68 = "tf.Shape"(%47) {device = ""} : (tensor) -> tensor<2xi32> + %69 = "tf.Cast"(%68) {Truncate = false, device = ""} : (tensor<2xi32>) -> tensor<2xi64> + %70 = "tf.StridedSlice"(%69, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %71 = "tf.Equal"(%70, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %72 = "tf.StridedSlice"(%43#0, %17, %6, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %73 = "tf.AddV2"(%72, %15) {device = ""} : (tensor, tensor) -> tensor + %74 = "tf.StridedSlice"(%43#0, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %75 = "tf.Minimum"(%73, %74) {device = ""} : (tensor, tensor) -> tensor + %76:2 = "tf.RaggedRange"(%75, %74, %15) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %77 = "tf.Shape"(%76#0) {device = ""} : (tensor) -> tensor<1xi64> + %78 = "tf.StridedSlice"(%77, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %79 = "tf.Sub"(%78, %15) {device = ""} : (tensor, tensor) -> tensor + %80 = "tf.Equal"(%38, %79) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %81 = "tf.All"(%80, %11) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor + %82 = "tf.If"(%81, %81, %38, %79) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedConcat_assert_equal_1_Assert_AssertGuard_false_99640, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedConcat_assert_equal_1_Assert_AssertGuard_true_99630} : (tensor, tensor, tensor, tensor) -> tensor + %83 = "tf.Identity"(%82) {device = ""} : (tensor) -> tensor + %84 = "tf.StridedSlice"(%41, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %85 = "tf.Mul"(%79, %5) {device = ""} : (tensor, tensor) -> tensor + %86 = "tf.Range"(%12, %85, %15) {device = ""} : (tensor, tensor, tensor) -> tensor + %87 = "tf.Reshape"(%86, %4) {device = ""} : (tensor, tensor<2xi32>) -> tensor<2x?xi64> + %88 = "tf.Transpose"(%87, %8) {device = ""} : 
(tensor<2x?xi64>, tensor<2xi32>) -> tensor + %89 = "tf.Reshape"(%88, %6) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %90 = "tf.StridedSlice"(%76#0, %6, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %91 = "tf.AddV2"(%84, %90) {device = ""} : (tensor, tensor) -> tensor + %92 = "tf.ConcatV2"(%76#0, %91, %16) {device = ""} : (tensor, tensor, tensor) -> tensor + %93 = "tf.GatherV2"(%43#2, %76#1, %16) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %94 = "tf.ConcatV2"(%93, %37, %16) {device = ""} : (tensor, tensor, tensor) -> tensor + %95:2 = "tf.RaggedGather"(%92, %94, %89) {OUTPUT_RAGGED_RANK = 1 : i64, PARAMS_RAGGED_RANK = 1 : i64, Tindices = i64, Tsplits = i64, Tvalues = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %96 = "tf.StridedSlice"(%95#0, %17, %17, %7) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %97 = "tf.StridedSlice"(%96, %17, %6, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %98 = "tf.Shape"(%97) {device = ""} : (tensor) -> tensor<1xi32> + %99 = "tf.ConcatV2"(%98, %18, %16) {device = ""} : (tensor<1xi32>, tensor<1xi32>, tensor) -> tensor<2xi32> + %100 = "tf.Reshape"(%97, %99) {device = ""} : (tensor, tensor<2xi32>) -> tensor + %101 = "tf.Shape"(%100) {device = ""} : (tensor) -> tensor<2xi64> + %102 = "tf.StridedSlice"(%101, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %103 = "tf.AddV2"(%102, %15) {device = ""} : (tensor, tensor) -> tensor + %104 = "tf.Range"(%12, %103, %15) {device = ""} : (tensor, tensor, tensor) -> tensor + %105 = "tf.Mul"(%104, %15) {device = ""} : (tensor, tensor) -> tensor + %106 = "tf.ExpandDims"(%105, %9) {device = ""} : (tensor, tensor) -> tensor + %107 = "tf.Shape"(%105) {device = ""} : (tensor) -> tensor<1xi32> + %108 = "tf.StridedSlice"(%107, %17, %17, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %109 = "tf.StridedSlice"(%107, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %110 = "tf.StridedSlice"(%107, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %111 = "tf.StridedSlice"(%105, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %112 = "tf.StridedSlice"(%105, %17, %6, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, 
new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %113 = "tf.Sub"(%111, %112) {device = ""} : (tensor, tensor) -> tensor + %114 = "tf.Shape"(%100) {device = ""} : (tensor) -> tensor<2xi32> + %115 = "tf.Cast"(%114) {Truncate = false, device = ""} : (tensor<2xi32>) -> tensor<2xi64> + %116 = "tf.StridedSlice"(%115, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %117 = "tf.Equal"(%116, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %118 = "tf.StridedSlice"(%115, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %119 = "tf.Equal"(%118, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %120 = "tf.StridedSlice"(%115, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %121 = "tf.Shape"(%100) {device = ""} : (tensor) -> tensor<2xi32> + %122 = "tf.Cast"(%121) {Truncate = false, device = ""} : (tensor<2xi32>) -> tensor<2xi64> + %123 = "tf.StridedSlice"(%122, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %124 = "tf.Equal"(%123, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %125:5 = "tf.WhitespaceTokenizeWithOffsets"(%43#1, %43#0) {Tsplits = i64, device = ""} : (tensor, tensor) -> (tensor, tensor, tensor, tensor, tensor) + %126 = "tf.StridedSlice"(%125#1, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %127 = "tf.Equal"(%126, %12) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %128 = "tf.All"(%127, %11) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor + %129 = "tf.If"(%128, %128, %126, %12) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_100400, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_100390} : (tensor, tensor, tensor, tensor) -> tensor + %130 = "tf.Identity"(%129) {device = ""} : (tensor) -> tensor + %131 = "tf.StridedSlice"(%125#1, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %132 = "tf.StridedSlice"(%125#1, %17, %6, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : 
i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %133 = "tf.Sub"(%131, %132) {device = ""} : (tensor, tensor) -> tensor + %134 = "tf.LessEqual"(%12, %133) {device = ""} : (tensor, tensor) -> tensor + %135 = "tf.All"(%134, %17) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor + %136 = "tf.If"(%135, %135, %133) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_100760, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_100750} : (tensor, tensor, tensor) -> tensor + %137 = "tf.Identity"(%136) {device = ""} : (tensor) -> tensor + %138 = "tf.Identity"(%125#1) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor + %139 = "tf.StridedSlice"(%138, %6, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %140 = "tf.Shape"(%125#0) {device = ""} : (tensor) -> tensor<1xi64> + %141 = "tf.StridedSlice"(%140, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %142 = "tf.Equal"(%139, %141) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %143 = "tf.All"(%142, %11) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor + %144 = "tf.If"(%143, %143, %139, %141) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_101100, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_101090} : (tensor, tensor, tensor, tensor) -> tensor + %145 = "tf.Identity"(%144) {device = ""} : (tensor) -> tensor + %146 = "tf.Identity"(%138) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor + %147 = "tf.Shape"(%146) {device = ""} : (tensor) -> tensor<1xi64> + %148 = "tf.StridedSlice"(%147, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %149 = "tf.Sub"(%148, %15) {device = ""} : (tensor, tensor) -> tensor + %150 = "tf.StridedSlice"(%125#4, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %151 = "tf.Equal"(%150, %12) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %152 = "tf.All"(%151, %11) {device = "", 
keep_dims = false} : (tensor, tensor<0xi32>) -> tensor + %153 = "tf.If"(%152, %152, %150, %12) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_101470, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_101460} : (tensor, tensor, tensor, tensor) -> tensor + %154 = "tf.Identity"(%153) {device = ""} : (tensor) -> tensor + %155 = "tf.StridedSlice"(%125#4, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %156 = "tf.StridedSlice"(%125#4, %17, %6, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %157 = "tf.Sub"(%155, %156) {device = ""} : (tensor, tensor) -> tensor + %158 = "tf.LessEqual"(%12, %157) {device = ""} : (tensor, tensor) -> tensor + %159 = "tf.All"(%158, %17) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor + %160 = "tf.If"(%159, %159, %157) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_101830, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_101820} : (tensor, tensor, tensor) -> tensor + %161 = "tf.Identity"(%160) {device = ""} : (tensor) -> tensor + %162 = "tf.Identity"(%125#4) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor + %163 = "tf.StridedSlice"(%162, %6, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %164 = "tf.Equal"(%163, %149) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %165 = "tf.All"(%164, %11) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor + %166 = "tf.If"(%165, %165, %163, %149) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_assert_equal_1_Assert_AssertGuard_false_102190, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_assert_equal_1_Assert_AssertGuard_true_102180} : (tensor, tensor, tensor, tensor) -> tensor + %167 = "tf.Identity"(%166) {device = ""} : (tensor) -> tensor + %168 = "tf.Identity"(%162) {_class = 
["loc:@WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor + %169 = "tf.StridedSlice"(%125#4, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %170 = "tf.Equal"(%169, %12) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %171 = "tf.All"(%170, %11) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor + %172 = "tf.If"(%171, %171, %169, %12) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_102540, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_102530} : (tensor, tensor, tensor, tensor) -> tensor + %173 = "tf.Identity"(%172) {device = ""} : (tensor) -> tensor + %174 = "tf.StridedSlice"(%125#4, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %175 = "tf.StridedSlice"(%125#4, %17, %6, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %176 = "tf.Sub"(%174, %175) {device = ""} : (tensor, tensor) -> tensor + %177 = "tf.LessEqual"(%12, %176) {device = ""} : (tensor, tensor) -> tensor + %178 = "tf.All"(%177, %17) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor + %179 = "tf.If"(%178, %178, %176) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_102900, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_102890} : (tensor, tensor, tensor) -> tensor + %180 = "tf.Identity"(%179) {device = ""} : (tensor) -> tensor + %181 = "tf.Identity"(%125#4) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor + %182 = "tf.StridedSlice"(%181, %6, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %183 = "tf.Shape"(%125#2) {device = ""} : (tensor) -> tensor<1xi64> + %184 = "tf.StridedSlice"(%183, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %185 = "tf.Equal"(%182, %184) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> 
tensor + %186 = "tf.All"(%185, %11) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor + %187 = "tf.If"(%186, %186, %182, %184) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_103240, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_103230} : (tensor, tensor, tensor, tensor) -> tensor + %188 = "tf.Identity"(%187) {device = ""} : (tensor) -> tensor + %189 = "tf.Identity"(%181) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor + %190 = "tf.Shape"(%189) {device = ""} : (tensor) -> tensor<1xi64> + %191 = "tf.StridedSlice"(%190, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %192 = "tf.Sub"(%191, %15) {device = ""} : (tensor, tensor) -> tensor + %193 = "tf.Equal"(%192, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %194 = "tf.LogicalOr"(%64, %193) {device = ""} : (tensor, tensor) -> tensor + %195 = "tf.Equal"(%192, %63) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %196 = "tf.LogicalOr"(%194, %195) {device = ""} : (tensor, tensor) -> tensor + %197 = "tf.StridedSlice"(%189, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %198 = "tf.StridedSlice"(%189, %17, %6, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %199 = "tf.Sub"(%197, %198) {device = ""} : (tensor, tensor) -> tensor + %200 = "tf.Shape"(%189) {device = ""} : (tensor) -> tensor<1xi64> + %201 = "tf.StridedSlice"(%200, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %202 = "tf.Sub"(%201, %15) {device = ""} : (tensor, tensor) -> tensor + %203 = "tf.Equal"(%202, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %204 = "tf.ExpandDims"(%189, %9) {device = ""} : (tensor, tensor) -> tensor + %205 = "tf.Shape"(%189) {device = ""} : (tensor) -> tensor<1xi32> + %206 = "tf.StridedSlice"(%205, %17, %17, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %207 = "tf.StridedSlice"(%205, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %208 = "tf.StridedSlice"(%205, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, 
shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %209 = "tf.StridedSlice"(%125#4, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %210 = "tf.Equal"(%209, %12) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %211 = "tf.All"(%210, %11) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor + %212 = "tf.If"(%211, %211, %209, %12) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_103610, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_103600} : (tensor, tensor, tensor, tensor) -> tensor + %213 = "tf.Identity"(%212) {device = ""} : (tensor) -> tensor + %214 = "tf.StridedSlice"(%125#4, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %215 = "tf.StridedSlice"(%125#4, %17, %6, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %216 = "tf.Sub"(%214, %215) {device = ""} : (tensor, tensor) -> tensor + %217 = "tf.LessEqual"(%12, %216) {device = ""} : (tensor, tensor) -> tensor + %218 = "tf.All"(%217, %17) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor + %219 = "tf.If"(%218, %218, %216) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_103970, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_103960} : (tensor, tensor, tensor) -> tensor + %220 = "tf.Identity"(%219) {device = ""} : (tensor) -> tensor + %221 = "tf.Identity"(%125#4) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor + %222 = "tf.StridedSlice"(%221, %6, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %223 = "tf.Shape"(%125#3) {device = ""} : (tensor) -> tensor<1xi64> + %224 = "tf.StridedSlice"(%223, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %225 = "tf.Equal"(%222, %224) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %226 = 
"tf.All"(%225, %11) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor + %227 = "tf.If"(%226, %226, %222, %224) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_104310, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_104300} : (tensor, tensor, tensor, tensor) -> tensor + %228 = "tf.Identity"(%227) {device = ""} : (tensor) -> tensor + %229 = "tf.Identity"(%221) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor + %230 = "tf.Shape"(%229) {device = ""} : (tensor) -> tensor<1xi64> + %231 = "tf.StridedSlice"(%230, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %232 = "tf.Sub"(%231, %15) {device = ""} : (tensor, tensor) -> tensor + %233 = "tf.Equal"(%232, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %234 = "tf.LogicalOr"(%233, %1) {device = ""} : (tensor, tensor) -> tensor + %235 = "tf.Equal"(%232, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %236 = "tf.LogicalOr"(%234, %235) {device = ""} : (tensor, tensor) -> tensor + %237 = "tf.StridedSlice"(%229, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %238 = "tf.StridedSlice"(%229, %17, %6, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %239 = "tf.Sub"(%237, %238) {device = ""} : (tensor, tensor) -> tensor + %240 = "tf.Shape"(%229) {device = ""} : (tensor) -> tensor<1xi64> + %241 = "tf.StridedSlice"(%240, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %242 = "tf.Sub"(%241, %15) {device = ""} : (tensor, tensor) -> tensor + %243 = "tf.Equal"(%242, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %244 = "tf.ExpandDims"(%229, %9) {device = ""} : (tensor, tensor) -> tensor + %245 = "tf.Shape"(%229) {device = ""} : (tensor) -> tensor<1xi32> + %246 = "tf.StridedSlice"(%245, %17, %17, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %247 = "tf.StridedSlice"(%245, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %248 = "tf.StridedSlice"(%245, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : 
i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %249 = "tf.StridedSlice"(%229, %6, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %250 = "tf.Range"(%12, %249, %15) {device = ""} : (tensor, tensor, tensor) -> tensor + %251 = "tf.StridedSlice"(%229, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %252 = "tf.StridedSlice"(%229, %17, %6, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %253 = "tf.Sub"(%251, %252) {device = ""} : (tensor, tensor) -> tensor + %254 = "tf.If"(%196, %196, %63, %192) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_AssertGuard_false_105110, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_AssertGuard_true_105100} : (tensor, tensor, tensor, tensor) -> tensor + %255 = "tf.Identity"(%254) {device = ""} : (tensor) -> tensor + %256 = "tf.Equal"(%192, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %257 = "tf.Select"(%256, %63, %192) {device = ""} : (tensor, tensor, tensor) -> tensor + %258 = "tf.Equal"(%257, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %259 = "tf.LogicalOr"(%258, %66) {device = ""} : (tensor, tensor) -> tensor + %260 = "tf.Equal"(%65, %257) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %261 = "tf.LogicalOr"(%259, %260) {device = ""} : (tensor, tensor) -> tensor + %262 = "tf.Select"(%203, %257, %15) {device = ""} : (tensor, tensor, tensor) -> tensor + %263 = "tf.Pack"(%262, %15) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> + %264 = "tf.StridedSlice"(%263, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %265 = "tf.Cast"(%264) {Truncate = false, device = ""} : (tensor) -> tensor + %266 = "tf.Reshape"(%265, %11) {device = ""} : (tensor, tensor<0xi32>) -> tensor + %267 = "tf.Pack"(%9, %266) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %268 = "tf.Tile"(%204, %267) {device = ""} : (tensor, tensor<2xi32>) -> tensor + %269 = "tf.Mul"(%266, %207) {device = ""} : (tensor, tensor) -> tensor + %270 = "tf.Pack"(%269) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %271 = "tf.ConcatV2"(%206, %270, %208, %16) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %272 = "tf.Reshape"(%268, %271) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %273 = "tf.Shape"(%272) {device = ""} : (tensor) -> tensor<1xi64> + %274 = "tf.StridedSlice"(%273, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %275 = 
"tf.Pack"(%264) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %276 = "tf.StridedSlice"(%272, %275, %13, %14) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %277 = "tf.Sub"(%274, %264) {device = ""} : (tensor, tensor) -> tensor + %278 = "tf.Pack"(%277) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %279 = "tf.StridedSlice"(%272, %13, %278, %14) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %280:2 = "tf.RaggedRange"(%279, %276, %15) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %281 = "tf.Select"(%71, %257, %15) {device = ""} : (tensor, tensor, tensor) -> tensor + %282 = "tf.Pack"(%281, %15) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> + %283 = "tf.StridedSlice"(%282, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %284 = "tf.Cast"(%283) {Truncate = false, device = ""} : (tensor) -> tensor + %285 = "tf.Reshape"(%284, %11) {device = ""} : (tensor, tensor<0xi32>) -> tensor + %286 = "tf.Pack"(%9, %285) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %287 = "tf.Tile"(%53, %286) {device = ""} : (tensor, tensor<2xi32>) -> tensor + %288 = "tf.Mul"(%285, %56) {device = ""} : (tensor, tensor) -> tensor + %289 = "tf.Pack"(%288) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %290 = "tf.ConcatV2"(%55, %289, %57, %16) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %291 = "tf.Reshape"(%287, %290) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %292 = "tf.Shape"(%291) {device = ""} : (tensor) -> tensor<1xi64> + %293 = "tf.StridedSlice"(%292, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %294 = "tf.Pack"(%283) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %295 = "tf.StridedSlice"(%291, %294, %13, %14) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %296 = "tf.Sub"(%293, %283) {device = ""} : (tensor, tensor) -> tensor + %297 = "tf.Pack"(%296) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %298 = "tf.StridedSlice"(%291, %13, %297, %14) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %299:2 = "tf.RaggedRange"(%298, %295, %15) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %300 = "tf.StridedSlice"(%282, %17, %18, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> + %301 = "tf.StridedSlice"(%282, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 
: i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %302 = "tf.Mul"(%60, %301) {device = ""} : (tensor, tensor) -> tensor + %303 = "tf.Tile"(%302, %300) {device = ""} : (tensor, tensor<1xi64>) -> tensor + %304 = "tf.Cumsum"(%303, %16) {device = "", exclusive = false, reverse = false} : (tensor, tensor) -> tensor + %305 = "tf.ConcatV2"(%13, %304, %2) {device = ""} : (tensor<1xi64>, tensor, tensor) -> tensor + %306 = "tf.StridedSlice"(%305, %17, %6, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %307 = "tf.ExpandDims"(%306, %9) {device = ""} : (tensor, tensor) -> tensor + %308 = "tf.Shape"(%306) {device = ""} : (tensor) -> tensor<1xi32> + %309 = "tf.StridedSlice"(%308, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %310 = "tf.Pack"(%309) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %311 = "tf.StridedSlice"(%305, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %312 = "tf.ExpandDims"(%311, %9) {device = ""} : (tensor, tensor) -> tensor + %313 = "tf.Shape"(%311) {device = ""} : (tensor) -> tensor<1xi32> + %314 = "tf.StridedSlice"(%313, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %315 = "tf.Pack"(%314) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %316 = "tf.Equal"(%192, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %317 = "tf.Select"(%316, %257, %15) {device = ""} : (tensor, tensor, tensor) -> tensor + %318 = "tf.Cast"(%317) {Truncate = false, device = ""} : (tensor) -> tensor + %319 = "tf.Reshape"(%318, %11) {device = ""} : (tensor, tensor<0xi32>) -> tensor + %320 = "tf.Pack"(%9, %319) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %321 = "tf.Mul"(%319, %10) {device = ""} : (tensor, tensor) -> tensor + %322 = "tf.Pack"(%321) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %323 = "tf.ConcatV2"(%11, %322, %11, %16) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %324 = "tf.Pack"(%317) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %325 = "tf.Pack"(%12, %192) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> + %326 = "tf.ExpandDims"(%325, %9) {device = ""} : (tensor<2xi64>, tensor) -> tensor<2x1xi64> + %327 = "tf.Tile"(%326, %320) {device = ""} : (tensor<2x1xi64>, tensor<2xi32>) -> tensor<2x?xi64> + %328 = "tf.Reshape"(%327, %323) {device = ""} : (tensor<2x?xi64>, tensor<1xi32>) -> tensor + %329 = "tf.Shape"(%328) {device = ""} : (tensor) -> tensor<1xi64> + %330 = "tf.StridedSlice"(%329, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %331 = "tf.Sub"(%330, %317) {device = ""} : 
(tensor, tensor) -> tensor + %332 = "tf.Pack"(%331) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %333 = "tf.StridedSlice"(%328, %13, %332, %14) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %334 = "tf.StridedSlice"(%328, %324, %13, %14) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %335:2 = "tf.RaggedRange"(%333, %334, %15) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %336 = "tf.GatherV2"(%199, %335#1, %16) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %337 = "tf.Cast"(%336) {Truncate = false, device = ""} : (tensor) -> tensor + %338 = "tf.BroadcastTo"(%337, %310) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %339 = "tf.Max"(%338, %17) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor + %340 = "tf.Maximum"(%16, %339) {device = ""} : (tensor, tensor) -> tensor + %341 = "tf.Range"(%16, %340, %9) {device = ""} : (tensor, tensor, tensor) -> tensor + %342 = "tf.Pack"(%9, %340) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %343 = "tf.Tile"(%307, %342) {device = ""} : (tensor, tensor<2xi32>) -> tensor + %344 = "tf.Shape"(%343) {device = ""} : (tensor) -> tensor<2xi32> + %345 = "tf.StridedSlice"(%344, %17, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> + %346 = "tf.Prod"(%345, %17) {device = "", keep_dims = false} : (tensor<2xi32>, tensor<1xi32>) -> tensor + %347 = "tf.Pack"(%346) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %348 = "tf.Shape"(%343) {device = ""} : (tensor) -> tensor<2xi32> + %349 = "tf.StridedSlice"(%348, %17, %17, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %350 = "tf.Shape"(%343) {device = ""} : (tensor) -> tensor<2xi32> + %351 = "tf.StridedSlice"(%350, %7, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %352 = "tf.ConcatV2"(%349, %347, %351, %16) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %353 = "tf.Reshape"(%343, %352) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %354 = "tf.ExpandDims"(%338, %2) {device = ""} : (tensor, tensor) -> tensor + %355 = "tf.Less"(%341, %354) {device = ""} : (tensor, tensor) -> tensor + %356 = "tf.Reshape"(%355, %6) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %357 = "tf.Where"(%356) {device = ""} : (tensor) -> tensor + %358 = "tf.Squeeze"(%357) {device = "", squeeze_dims = [1]} : (tensor) -> tensor + %359 = "tf.GatherV2"(%353, %358, %16) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %360 = "tf.Cast"(%336) {Truncate = false, device = ""} : (tensor) -> tensor + %361 = "tf.BroadcastTo"(%360, %315) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %362 = "tf.Max"(%361, %17) {device = "", 
keep_dims = false} : (tensor, tensor<1xi32>) -> tensor + %363 = "tf.Maximum"(%16, %362) {device = ""} : (tensor, tensor) -> tensor + %364 = "tf.Range"(%16, %363, %9) {device = ""} : (tensor, tensor, tensor) -> tensor + %365 = "tf.Pack"(%9, %363) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %366 = "tf.Tile"(%312, %365) {device = ""} : (tensor, tensor<2xi32>) -> tensor + %367 = "tf.Shape"(%366) {device = ""} : (tensor) -> tensor<2xi32> + %368 = "tf.StridedSlice"(%367, %17, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> + %369 = "tf.Prod"(%368, %17) {device = "", keep_dims = false} : (tensor<2xi32>, tensor<1xi32>) -> tensor + %370 = "tf.Pack"(%369) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %371 = "tf.Shape"(%366) {device = ""} : (tensor) -> tensor<2xi32> + %372 = "tf.StridedSlice"(%371, %17, %17, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %373 = "tf.Shape"(%366) {device = ""} : (tensor) -> tensor<2xi32> + %374 = "tf.StridedSlice"(%373, %7, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %375 = "tf.ConcatV2"(%372, %370, %374, %16) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %376 = "tf.Reshape"(%366, %375) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %377 = "tf.ExpandDims"(%361, %2) {device = ""} : (tensor, tensor) -> tensor + %378 = "tf.Less"(%364, %377) {device = ""} : (tensor, tensor) -> tensor + %379 = "tf.Reshape"(%378, %6) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %380 = "tf.Where"(%379) {device = ""} : (tensor) -> tensor + %381 = "tf.Squeeze"(%380) {device = "", squeeze_dims = [1]} : (tensor) -> tensor + %382 = "tf.GatherV2"(%376, %381, %16) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %383:2 = "tf.RaggedRange"(%359, %382, %15) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %384 = "tf.If"(%261, %261, %257, %67) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_1_AssertGuard_false_106180, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_1_AssertGuard_true_106170} : (tensor, tensor, tensor, tensor) -> tensor + %385 = "tf.Identity"(%384) {device = ""} : (tensor) -> tensor + %386 = "tf.StridedSlice"(%62, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %387 = "tf.Equal"(%386, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %388 = "tf.Select"(%387, %257, %386) {device = ""} : (tensor, tensor, tensor) -> tensor + %389 = "tf.Pack"(%388) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %390 = "tf.StridedSlice"(%62, %17, %17, %18) {begin_mask = 1 : i64, device = "", 
ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi64> + %391 = "tf.StridedSlice"(%62, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> + %392 = "tf.ConcatV2"(%390, %389, %391, %16) {device = ""} : (tensor<0xi64>, tensor<1xi64>, tensor<1xi64>, tensor) -> tensor<2xi64> + %393 = "tf.StridedSlice"(%392, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %394 = "tf.Equal"(%393, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %395 = "tf.StridedSlice"(%392, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %396 = "tf.StridedSlice"(%392, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %397 = "tf.Equal"(%396, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %398 = "tf.If"(%397, %397, %396, %336) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_2_AssertGuard_false_106670, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_2_AssertGuard_true_106660} : (tensor, tensor, tensor, tensor) -> tensor + %399 = "tf.Identity"(%398) {device = ""} : (tensor) -> tensor + %400 = "tf.If"(%394, %394, %336, %395) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_3_AssertGuard_false_107030, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_3_AssertGuard_true_107020} : (tensor, tensor, tensor, tensor) -> tensor + %401 = "tf.If"(%236, %236, %15, %232) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_Assert_AssertGuard_false_111870, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_Assert_AssertGuard_true_111860} : (tensor, tensor, tensor, tensor) -> tensor + %402 = "tf.Identity"(%401) {device = ""} : (tensor) -> tensor + %403 = "tf.Equal"(%232, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %404 = "tf.Select"(%403, %15, %232) {device = ""} : (tensor, tensor, tensor) -> tensor + %405 = "tf.Equal"(%404, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %406 = "tf.LogicalOr"(%405, %1) {device = ""} : (tensor, tensor) -> tensor + %407 = "tf.Equal"(%404, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %408 = "tf.LogicalOr"(%406, %407) {device = 
""} : (tensor, tensor) -> tensor + %409 = "tf.Select"(%243, %404, %15) {device = ""} : (tensor, tensor, tensor) -> tensor + %410 = "tf.Pack"(%409, %15) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> + %411 = "tf.StridedSlice"(%410, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %412 = "tf.Cast"(%411) {Truncate = false, device = ""} : (tensor) -> tensor + %413 = "tf.Reshape"(%412, %11) {device = ""} : (tensor, tensor<0xi32>) -> tensor + %414 = "tf.Pack"(%9, %413) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %415 = "tf.Tile"(%244, %414) {device = ""} : (tensor, tensor<2xi32>) -> tensor + %416 = "tf.Mul"(%413, %247) {device = ""} : (tensor, tensor) -> tensor + %417 = "tf.Pack"(%416) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %418 = "tf.ConcatV2"(%246, %417, %248, %16) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %419 = "tf.Reshape"(%415, %418) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %420 = "tf.Shape"(%419) {device = ""} : (tensor) -> tensor<1xi64> + %421 = "tf.StridedSlice"(%420, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %422 = "tf.Pack"(%411) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %423 = "tf.StridedSlice"(%419, %422, %13, %14) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %424 = "tf.Sub"(%421, %411) {device = ""} : (tensor, tensor) -> tensor + %425 = "tf.Pack"(%424) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %426 = "tf.StridedSlice"(%419, %13, %425, %14) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %427:2 = "tf.RaggedRange"(%426, %423, %15) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %428 = "tf.GatherV2"(%250, %427#1, %16) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %429 = "tf.StridedSlice"(%410, %17, %18, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> + %430 = "tf.StridedSlice"(%410, %17, %18, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> + %431 = "tf.StridedSlice"(%410, %7, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi64> + %432 = "tf.ConcatV2"(%430, %431, %16) {device = ""} : (tensor<1xi64>, tensor<0xi64>, tensor) -> tensor<1xi64> + %433 = "tf.StridedSlice"(%410, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask 
= 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %434 = "tf.Mul"(%253, %433) {device = ""} : (tensor, tensor) -> tensor + %435 = "tf.Tile"(%434, %429) {device = ""} : (tensor, tensor<1xi64>) -> tensor + %436 = "tf.Cumsum"(%435, %16) {device = "", exclusive = false, reverse = false} : (tensor, tensor) -> tensor + %437 = "tf.ConcatV2"(%13, %436, %2) {device = ""} : (tensor<1xi64>, tensor, tensor) -> tensor + %438 = "tf.Shape"(%437) {device = ""} : (tensor) -> tensor<1xi64> + %439 = "tf.StridedSlice"(%438, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %440 = "tf.Sub"(%439, %15) {device = ""} : (tensor, tensor) -> tensor + %441 = "tf.Equal"(%440, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %442 = "tf.LogicalOr"(%117, %441) {device = ""} : (tensor, tensor) -> tensor + %443 = "tf.Equal"(%440, %116) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %444 = "tf.LogicalOr"(%442, %443) {device = ""} : (tensor, tensor) -> tensor + %445 = "tf.StridedSlice"(%437, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %446 = "tf.StridedSlice"(%437, %17, %6, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %447 = "tf.Sub"(%445, %446) {device = ""} : (tensor, tensor) -> tensor + %448 = "tf.Shape"(%437) {device = ""} : (tensor) -> tensor<1xi64> + %449 = "tf.StridedSlice"(%448, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %450 = "tf.Sub"(%449, %15) {device = ""} : (tensor, tensor) -> tensor + %451 = "tf.Equal"(%450, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %452 = "tf.ExpandDims"(%437, %9) {device = ""} : (tensor, tensor) -> tensor + %453 = "tf.Shape"(%437) {device = ""} : (tensor) -> tensor<1xi32> + %454 = "tf.StridedSlice"(%453, %17, %17, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %455 = "tf.StridedSlice"(%453, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %456 = "tf.StridedSlice"(%453, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %457 = "tf.Select"(%1, %404, %15) {device = ""} : (tensor, tensor, tensor) -> tensor + %458 = "tf.Pack"(%457, %15) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> + %459 = "tf.StridedSlice"(%458, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, 
shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %460 = "tf.Cast"(%459) {Truncate = false, device = ""} : (tensor) -> tensor + %461 = "tf.Reshape"(%460, %11) {device = ""} : (tensor, tensor<0xi32>) -> tensor + %462 = "tf.Pack"(%9, %461) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %463 = "tf.Tile"(%3, %462) {device = ""} : (tensor<2x1xi64>, tensor<2xi32>) -> tensor<2x?xi64> + %464 = "tf.Mul"(%461, %10) {device = ""} : (tensor, tensor) -> tensor + %465 = "tf.Pack"(%464) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %466 = "tf.ConcatV2"(%11, %465, %11, %16) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %467 = "tf.Reshape"(%463, %466) {device = ""} : (tensor<2x?xi64>, tensor<1xi32>) -> tensor + %468 = "tf.Shape"(%467) {device = ""} : (tensor) -> tensor<1xi64> + %469 = "tf.StridedSlice"(%468, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %470 = "tf.Pack"(%459) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %471 = "tf.StridedSlice"(%467, %470, %13, %14) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %472 = "tf.Sub"(%469, %459) {device = ""} : (tensor, tensor) -> tensor + %473 = "tf.Pack"(%472) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %474 = "tf.StridedSlice"(%467, %13, %473, %14) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %475:2 = "tf.RaggedRange"(%474, %471, %15) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %476 = "tf.GatherV2"(%13, %475#1, %16) {batch_dims = 0 : i64, device = ""} : (tensor<1xi64>, tensor, tensor) -> tensor + %477 = "tf.GatherV2"(%14, %476, %16) {batch_dims = 0 : i64, device = ""} : (tensor<1xi64>, tensor, tensor) -> tensor + %478 = "tf.StridedSlice"(%458, %17, %18, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> + %479 = "tf.StridedSlice"(%458, %17, %18, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> + %480 = "tf.StridedSlice"(%458, %7, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi64> + %481 = "tf.ConcatV2"(%479, %480, %16) {device = ""} : (tensor<1xi64>, tensor<0xi64>, tensor) -> tensor<1xi64> + %482 = "tf.Tile"(%477, %481) {device = ""} : (tensor, tensor<1xi64>) -> tensor + %483 = "tf.StridedSlice"(%458, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %484 = "tf.Mul"(%483, %14) {device = ""} : (tensor, 
tensor<1xi64>) -> tensor<1xi64> + %485 = "tf.Tile"(%484, %478) {device = ""} : (tensor<1xi64>, tensor<1xi64>) -> tensor + %486 = "tf.Cumsum"(%485, %16) {device = "", exclusive = false, reverse = false} : (tensor, tensor) -> tensor + %487 = "tf.ConcatV2"(%13, %486, %2) {device = ""} : (tensor<1xi64>, tensor, tensor) -> tensor + %488 = "tf.StridedSlice"(%487, %17, %6, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %489 = "tf.ExpandDims"(%488, %9) {device = ""} : (tensor, tensor) -> tensor + %490 = "tf.Shape"(%488) {device = ""} : (tensor) -> tensor<1xi32> + %491 = "tf.StridedSlice"(%490, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %492 = "tf.Pack"(%491) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %493 = "tf.StridedSlice"(%487, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %494 = "tf.ExpandDims"(%493, %9) {device = ""} : (tensor, tensor) -> tensor + %495 = "tf.Shape"(%493) {device = ""} : (tensor) -> tensor<1xi32> + %496 = "tf.StridedSlice"(%495, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %497 = "tf.Pack"(%496) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %498 = "tf.Equal"(%232, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %499 = "tf.Select"(%498, %404, %15) {device = ""} : (tensor, tensor, tensor) -> tensor + %500 = "tf.Cast"(%499) {Truncate = false, device = ""} : (tensor) -> tensor + %501 = "tf.Reshape"(%500, %11) {device = ""} : (tensor, tensor<0xi32>) -> tensor + %502 = "tf.Pack"(%9, %501) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %503 = "tf.Mul"(%501, %10) {device = ""} : (tensor, tensor) -> tensor + %504 = "tf.Pack"(%503) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %505 = "tf.ConcatV2"(%11, %504, %11, %16) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %506 = "tf.Pack"(%499) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %507 = "tf.Pack"(%12, %232) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> + %508 = "tf.ExpandDims"(%507, %9) {device = ""} : (tensor<2xi64>, tensor) -> tensor<2x1xi64> + %509 = "tf.Tile"(%508, %502) {device = ""} : (tensor<2x1xi64>, tensor<2xi32>) -> tensor<2x?xi64> + %510 = "tf.Reshape"(%509, %505) {device = ""} : (tensor<2x?xi64>, tensor<1xi32>) -> tensor + %511 = "tf.Shape"(%510) {device = ""} : (tensor) -> tensor<1xi64> + %512 = "tf.StridedSlice"(%511, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %513 = "tf.Sub"(%512, %499) {device = ""} : (tensor, tensor) -> tensor + %514 = "tf.Pack"(%513) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %515 = "tf.StridedSlice"(%510, %13, %514, %14) {begin_mask = 1 : i64, device = 
"", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %516 = "tf.StridedSlice"(%510, %506, %13, %14) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %517:2 = "tf.RaggedRange"(%515, %516, %15) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %518 = "tf.GatherV2"(%239, %517#1, %16) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %519 = "tf.Cast"(%518) {Truncate = false, device = ""} : (tensor) -> tensor + %520 = "tf.BroadcastTo"(%519, %492) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %521 = "tf.Max"(%520, %17) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor + %522 = "tf.Maximum"(%16, %521) {device = ""} : (tensor, tensor) -> tensor + %523 = "tf.Range"(%16, %522, %9) {device = ""} : (tensor, tensor, tensor) -> tensor + %524 = "tf.Pack"(%9, %522) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %525 = "tf.Tile"(%489, %524) {device = ""} : (tensor, tensor<2xi32>) -> tensor + %526 = "tf.Shape"(%525) {device = ""} : (tensor) -> tensor<2xi32> + %527 = "tf.StridedSlice"(%526, %17, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> + %528 = "tf.Prod"(%527, %17) {device = "", keep_dims = false} : (tensor<2xi32>, tensor<1xi32>) -> tensor + %529 = "tf.Pack"(%528) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %530 = "tf.Shape"(%525) {device = ""} : (tensor) -> tensor<2xi32> + %531 = "tf.StridedSlice"(%530, %17, %17, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %532 = "tf.Shape"(%525) {device = ""} : (tensor) -> tensor<2xi32> + %533 = "tf.StridedSlice"(%532, %7, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %534 = "tf.ConcatV2"(%531, %529, %533, %16) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %535 = "tf.Reshape"(%525, %534) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %536 = "tf.ExpandDims"(%520, %2) {device = ""} : (tensor, tensor) -> tensor + %537 = "tf.Less"(%523, %536) {device = ""} : (tensor, tensor) -> tensor + %538 = "tf.Reshape"(%537, %6) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %539 = "tf.Where"(%538) {device = ""} : (tensor) -> tensor + %540 = "tf.Squeeze"(%539) {device = "", squeeze_dims = [1]} : (tensor) -> tensor + %541 = "tf.GatherV2"(%535, %540, %16) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %542 = "tf.Cast"(%518) {Truncate = false, device = ""} : (tensor) -> tensor + %543 = "tf.BroadcastTo"(%542, %497) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %544 = "tf.Max"(%543, %17) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor + %545 = "tf.Maximum"(%16, %544) {device = ""} : (tensor, tensor) -> tensor + %546 = "tf.Range"(%16, %545, %9) {device = ""} : (tensor, 
tensor, tensor) -> tensor + %547 = "tf.Pack"(%9, %545) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %548 = "tf.Tile"(%494, %547) {device = ""} : (tensor, tensor<2xi32>) -> tensor + %549 = "tf.Shape"(%548) {device = ""} : (tensor) -> tensor<2xi32> + %550 = "tf.StridedSlice"(%549, %17, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> + %551 = "tf.Prod"(%550, %17) {device = "", keep_dims = false} : (tensor<2xi32>, tensor<1xi32>) -> tensor + %552 = "tf.Pack"(%551) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %553 = "tf.Shape"(%548) {device = ""} : (tensor) -> tensor<2xi32> + %554 = "tf.StridedSlice"(%553, %17, %17, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %555 = "tf.Shape"(%548) {device = ""} : (tensor) -> tensor<2xi32> + %556 = "tf.StridedSlice"(%555, %7, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %557 = "tf.ConcatV2"(%554, %552, %556, %16) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %558 = "tf.Reshape"(%548, %557) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %559 = "tf.ExpandDims"(%543, %2) {device = ""} : (tensor, tensor) -> tensor + %560 = "tf.Less"(%546, %559) {device = ""} : (tensor, tensor) -> tensor + %561 = "tf.Reshape"(%560, %6) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %562 = "tf.Where"(%561) {device = ""} : (tensor) -> tensor + %563 = "tf.Squeeze"(%562) {device = "", squeeze_dims = [1]} : (tensor) -> tensor + %564 = "tf.GatherV2"(%558, %563, %16) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %565:2 = "tf.RaggedRange"(%541, %564, %15) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %566 = "tf.GatherV2"(%482, %565#1, %16) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %567 = "tf.If"(%408, %408, %404, %15) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_Assert_1_AssertGuard_false_112940, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_Assert_1_AssertGuard_true_112930} : (tensor, tensor, tensor, tensor) -> tensor + %568 = "tf.Identity"(%567) {device = ""} : (tensor) -> tensor + %569 = "tf.Select"(%1, %404, %15) {device = ""} : (tensor, tensor, tensor) -> tensor + %570 = "tf.Pack"(%569) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %571 = "tf.ConcatV2"(%0, %570, %14, %16) {device = ""} : (tensor<0xi64>, tensor<1xi64>, tensor<1xi64>, tensor) -> tensor<2xi64> + %572 = "tf.StridedSlice"(%571, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %573 = "tf.Equal"(%572, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %574 = "tf.StridedSlice"(%571, %18, %7, %18) {begin_mask = 0 : 
i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %575 = "tf.StridedSlice"(%571, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %576 = "tf.Equal"(%575, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %577 = "tf.If"(%576, %576, %575, %518) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_Assert_2_AssertGuard_false_113430, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_Assert_2_AssertGuard_true_113420} : (tensor, tensor, tensor, tensor) -> tensor + %578 = "tf.Identity"(%577) {device = ""} : (tensor) -> tensor + %579 = "tf.If"(%573, %573, %518, %574) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_Assert_3_AssertGuard_false_113790, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_Assert_3_AssertGuard_true_113780} : (tensor, tensor, tensor, tensor) -> tensor + %580 = "tf.Identity"(%579) {device = ""} : (tensor) -> tensor + %581 = "tf.If"(%444, %444, %116, %440) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_AssertGuard_false_118470, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_AssertGuard_true_118460} : (tensor, tensor, tensor, tensor) -> tensor + %582 = "tf.Identity"(%581) {device = ""} : (tensor) -> tensor + %583 = "tf.Equal"(%440, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %584 = "tf.Select"(%583, %116, %440) {device = ""} : (tensor, tensor, tensor) -> tensor + %585 = "tf.Equal"(%584, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %586 = "tf.LogicalOr"(%585, %119) {device = ""} : (tensor, tensor) -> tensor + %587 = "tf.Equal"(%118, %584) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %588 = "tf.LogicalOr"(%586, %587) {device = ""} : (tensor, tensor) -> tensor + %589 = "tf.Select"(%451, %584, %15) {device = ""} : (tensor, tensor, tensor) -> tensor + %590 = "tf.Pack"(%589, %15) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> + %591 = "tf.StridedSlice"(%590, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %592 = "tf.Cast"(%591) {Truncate = false, device = ""} : (tensor) -> tensor + %593 = "tf.Reshape"(%592, %11) {device = ""} : (tensor, tensor<0xi32>) -> tensor + %594 = "tf.Pack"(%9, %593) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %595 = "tf.Tile"(%452, %594) {device = ""} : (tensor, tensor<2xi32>) -> tensor + %596 = "tf.Mul"(%593, %455) {device = ""} : (tensor, tensor) -> tensor + %597 = "tf.Pack"(%596) {axis = 0 : i64, device = ""} : (tensor) -> 
tensor<1xi32> + %598 = "tf.ConcatV2"(%454, %597, %456, %16) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %599 = "tf.Reshape"(%595, %598) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %600 = "tf.Shape"(%599) {device = ""} : (tensor) -> tensor<1xi64> + %601 = "tf.StridedSlice"(%600, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %602 = "tf.Pack"(%591) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %603 = "tf.StridedSlice"(%599, %602, %13, %14) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %604 = "tf.Sub"(%601, %591) {device = ""} : (tensor, tensor) -> tensor + %605 = "tf.Pack"(%604) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %606 = "tf.StridedSlice"(%599, %13, %605, %14) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %607:2 = "tf.RaggedRange"(%606, %603, %15) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %608 = "tf.Select"(%124, %584, %15) {device = ""} : (tensor, tensor, tensor) -> tensor + %609 = "tf.Pack"(%608, %15) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> + %610 = "tf.StridedSlice"(%609, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %611 = "tf.Cast"(%610) {Truncate = false, device = ""} : (tensor) -> tensor + %612 = "tf.Reshape"(%611, %11) {device = ""} : (tensor, tensor<0xi32>) -> tensor + %613 = "tf.Pack"(%9, %612) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %614 = "tf.Tile"(%106, %613) {device = ""} : (tensor, tensor<2xi32>) -> tensor + %615 = "tf.Mul"(%612, %109) {device = ""} : (tensor, tensor) -> tensor + %616 = "tf.Pack"(%615) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %617 = "tf.ConcatV2"(%108, %616, %110, %16) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %618 = "tf.Reshape"(%614, %617) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %619 = "tf.Shape"(%618) {device = ""} : (tensor) -> tensor<1xi64> + %620 = "tf.StridedSlice"(%619, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %621 = "tf.Pack"(%610) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %622 = "tf.StridedSlice"(%618, %621, %13, %14) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %623 = "tf.Sub"(%620, %610) {device = ""} : (tensor, tensor) -> tensor + %624 = "tf.Pack"(%623) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %625 = "tf.StridedSlice"(%618, %13, %624, %14) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 
0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %626:2 = "tf.RaggedRange"(%625, %622, %15) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %627 = "tf.StridedSlice"(%609, %17, %18, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> + %628 = "tf.StridedSlice"(%609, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %629 = "tf.Mul"(%113, %628) {device = ""} : (tensor, tensor) -> tensor + %630 = "tf.Tile"(%629, %627) {device = ""} : (tensor, tensor<1xi64>) -> tensor + %631 = "tf.Cumsum"(%630, %16) {device = "", exclusive = false, reverse = false} : (tensor, tensor) -> tensor + %632 = "tf.ConcatV2"(%13, %631, %2) {device = ""} : (tensor<1xi64>, tensor, tensor) -> tensor + %633 = "tf.StridedSlice"(%632, %17, %6, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %634 = "tf.ExpandDims"(%633, %9) {device = ""} : (tensor, tensor) -> tensor + %635 = "tf.Shape"(%633) {device = ""} : (tensor) -> tensor<1xi32> + %636 = "tf.StridedSlice"(%635, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %637 = "tf.Pack"(%636) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %638 = "tf.StridedSlice"(%632, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %639 = "tf.ExpandDims"(%638, %9) {device = ""} : (tensor, tensor) -> tensor + %640 = "tf.Shape"(%638) {device = ""} : (tensor) -> tensor<1xi32> + %641 = "tf.StridedSlice"(%640, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %642 = "tf.Pack"(%641) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %643 = "tf.Equal"(%440, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %644 = "tf.Select"(%643, %584, %15) {device = ""} : (tensor, tensor, tensor) -> tensor + %645 = "tf.Cast"(%644) {Truncate = false, device = ""} : (tensor) -> tensor + %646 = "tf.Reshape"(%645, %11) {device = ""} : (tensor, tensor<0xi32>) -> tensor + %647 = "tf.Pack"(%9, %646) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %648 = "tf.Mul"(%646, %10) {device = ""} : (tensor, tensor) -> tensor + %649 = "tf.Pack"(%648) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %650 = "tf.ConcatV2"(%11, %649, %11, %16) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %651 = "tf.Pack"(%644) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %652 = "tf.Pack"(%12, %440) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> + %653 = "tf.ExpandDims"(%652, %9) {device = ""} : (tensor<2xi64>, tensor) -> 
tensor<2x1xi64> + %654 = "tf.Tile"(%653, %647) {device = ""} : (tensor<2x1xi64>, tensor<2xi32>) -> tensor<2x?xi64> + %655 = "tf.Reshape"(%654, %650) {device = ""} : (tensor<2x?xi64>, tensor<1xi32>) -> tensor + %656 = "tf.Shape"(%655) {device = ""} : (tensor) -> tensor<1xi64> + %657 = "tf.StridedSlice"(%656, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %658 = "tf.Sub"(%657, %644) {device = ""} : (tensor, tensor) -> tensor + %659 = "tf.Pack"(%658) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %660 = "tf.StridedSlice"(%655, %13, %659, %14) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %661 = "tf.StridedSlice"(%655, %651, %13, %14) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %662:2 = "tf.RaggedRange"(%660, %661, %15) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %663 = "tf.GatherV2"(%447, %662#1, %16) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %664 = "tf.Cast"(%663) {Truncate = false, device = ""} : (tensor) -> tensor + %665 = "tf.BroadcastTo"(%664, %637) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %666 = "tf.Max"(%665, %17) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor + %667 = "tf.Maximum"(%16, %666) {device = ""} : (tensor, tensor) -> tensor + %668 = "tf.Range"(%16, %667, %9) {device = ""} : (tensor, tensor, tensor) -> tensor + %669 = "tf.Pack"(%9, %667) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %670 = "tf.Tile"(%634, %669) {device = ""} : (tensor, tensor<2xi32>) -> tensor + %671 = "tf.Shape"(%670) {device = ""} : (tensor) -> tensor<2xi32> + %672 = "tf.StridedSlice"(%671, %17, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> + %673 = "tf.Prod"(%672, %17) {device = "", keep_dims = false} : (tensor<2xi32>, tensor<1xi32>) -> tensor + %674 = "tf.Pack"(%673) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %675 = "tf.Shape"(%670) {device = ""} : (tensor) -> tensor<2xi32> + %676 = "tf.StridedSlice"(%675, %17, %17, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %677 = "tf.Shape"(%670) {device = ""} : (tensor) -> tensor<2xi32> + %678 = "tf.StridedSlice"(%677, %7, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %679 = "tf.ConcatV2"(%676, %674, %678, %16) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %680 = "tf.Reshape"(%670, %679) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %681 = "tf.ExpandDims"(%665, %2) {device = ""} : (tensor, tensor) -> tensor + %682 = "tf.Less"(%668, %681) {device = ""} : 
(tensor, tensor) -> tensor + %683 = "tf.Reshape"(%682, %6) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %684 = "tf.Where"(%683) {device = ""} : (tensor) -> tensor + %685 = "tf.Squeeze"(%684) {device = "", squeeze_dims = [1]} : (tensor) -> tensor + %686 = "tf.GatherV2"(%680, %685, %16) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %687 = "tf.Cast"(%663) {Truncate = false, device = ""} : (tensor) -> tensor + %688 = "tf.BroadcastTo"(%687, %642) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %689 = "tf.Max"(%688, %17) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor + %690 = "tf.Maximum"(%16, %689) {device = ""} : (tensor, tensor) -> tensor + %691 = "tf.Range"(%16, %690, %9) {device = ""} : (tensor, tensor, tensor) -> tensor + %692 = "tf.Pack"(%9, %690) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %693 = "tf.Tile"(%639, %692) {device = ""} : (tensor, tensor<2xi32>) -> tensor + %694 = "tf.Shape"(%693) {device = ""} : (tensor) -> tensor<2xi32> + %695 = "tf.StridedSlice"(%694, %17, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> + %696 = "tf.Prod"(%695, %17) {device = "", keep_dims = false} : (tensor<2xi32>, tensor<1xi32>) -> tensor + %697 = "tf.Pack"(%696) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %698 = "tf.Shape"(%693) {device = ""} : (tensor) -> tensor<2xi32> + %699 = "tf.StridedSlice"(%698, %17, %17, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %700 = "tf.Shape"(%693) {device = ""} : (tensor) -> tensor<2xi32> + %701 = "tf.StridedSlice"(%700, %7, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %702 = "tf.ConcatV2"(%699, %697, %701, %16) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %703 = "tf.Reshape"(%693, %702) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %704 = "tf.ExpandDims"(%688, %2) {device = ""} : (tensor, tensor) -> tensor + %705 = "tf.Less"(%691, %704) {device = ""} : (tensor, tensor) -> tensor + %706 = "tf.Reshape"(%705, %6) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %707 = "tf.Where"(%706) {device = ""} : (tensor) -> tensor + %708 = "tf.Squeeze"(%707) {device = "", squeeze_dims = [1]} : (tensor) -> tensor + %709 = "tf.GatherV2"(%703, %708, %16) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %710:2 = "tf.RaggedRange"(%686, %709, %15) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %711 = "tf.If"(%588, %588, %584, %120) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_1_AssertGuard_false_119540, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_1_AssertGuard_true_119530} : (tensor, tensor, tensor, tensor) -> tensor + %712 = "tf.Identity"(%711) {device = ""} : (tensor) -> tensor + %713 = "tf.StridedSlice"(%115, %17, %18, 
%18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %714 = "tf.Equal"(%713, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %715 = "tf.Select"(%714, %584, %713) {device = ""} : (tensor, tensor, tensor) -> tensor + %716 = "tf.Pack"(%715) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %717 = "tf.StridedSlice"(%115, %17, %17, %18) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi64> + %718 = "tf.StridedSlice"(%115, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> + %719 = "tf.ConcatV2"(%717, %716, %718, %16) {device = ""} : (tensor<0xi64>, tensor<1xi64>, tensor<1xi64>, tensor) -> tensor<2xi64> + %720 = "tf.StridedSlice"(%719, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %721 = "tf.Equal"(%720, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %722 = "tf.StridedSlice"(%719, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %723 = "tf.StridedSlice"(%719, %18, %7, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %724 = "tf.Equal"(%723, %15) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %725 = "tf.If"(%724, %724, %723, %663) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_2_AssertGuard_false_120030, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_2_AssertGuard_true_120020} : (tensor, tensor, tensor, tensor) -> tensor + %726 = "tf.Identity"(%725) {device = ""} : (tensor) -> tensor + %727 = "tf.If"(%721, %721, %663, %722) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_3_AssertGuard_false_120390, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_3_AssertGuard_true_120380} : (tensor, tensor, tensor, tensor) -> tensor + %728 = "tf.Identity"(%168) {device = ""} : (tensor) -> tensor + %729 = "tf.Identity"(%727) {device = ""} : (tensor) -> tensor + %730 = "tf.Identity"(%400) {device = ""} : (tensor) -> tensor + %731 = "tf.Shape"(%125#2) {device = ""} : (tensor) -> tensor<1xi32> + %732 = "tf.StridedSlice"(%731, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : 
i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %733 = "tf.Cast"(%732) {Truncate = false, device = ""} : (tensor<0xi32>) -> tensor<0xi64> + %734 = "tf.Identity"(%733) {device = ""} : (tensor<0xi64>) -> tensor<0xi64> + %735 = "tf.Shape"(%125#3) {device = ""} : (tensor) -> tensor<1xi32> + %736 = "tf.StridedSlice"(%735, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %737 = "tf.Cast"(%736) {Truncate = false, device = ""} : (tensor<0xi32>) -> tensor<0xi64> + %738 = "tf.Identity"(%737) {device = ""} : (tensor<0xi64>) -> tensor<0xi64> + %739 = "tf.GatherV2"(%125#3, %428, %16) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %740 = "tf.Tile"(%739, %432) {device = ""} : (tensor, tensor<1xi64>) -> tensor + %741 = "tf.Sub"(%740, %566) {device = ""} : (tensor, tensor) -> tensor + %742 = "tf.Shape"(%741) {device = ""} : (tensor) -> tensor<1xi32> + %743 = "tf.StridedSlice"(%742, %18, %17, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %744 = "tf.Cast"(%743) {Truncate = false, device = ""} : (tensor<0xi32>) -> tensor<0xi64> + %745 = "tf.Identity"(%744) {device = ""} : (tensor<0xi64>) -> tensor<0xi64> + %746 = "tf.UnicodeEncode"(%125#0, %146) {Tsplits = i64, device = "", errors = "replace", output_encoding = "UTF-8", replacement_char = 65533 : i64} : (tensor, tensor) -> tensor + %747 = "tf.Identity"(%746) {device = ""} : (tensor) -> tensor + %748 = "tf.StridedSlice"(%19, %17, %18, %18) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %749 = "tf.AddV2"(%748, %15) {device = ""} : (tensor, tensor) -> tensor + %750 = "tf.Range"(%12, %749, %15) {device = ""} : (tensor, tensor, tensor) -> tensor + %751 = "tf.Mul"(%750, %15) {device = ""} : (tensor, tensor) -> tensor + %752 = "tf.Identity"(%751) {device = ""} : (tensor) -> tensor + return %747, %752, %728 : tensor, tensor, tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedConcat_assert_equal_1_Assert_AssertGuard_false_99640(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Input tensors have incompatible shapes."> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedConcat/RaggedFromTensor/strided_slice_4:0) = "> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedConcat/RaggedNRows/sub:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func 
@WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedConcat_assert_equal_1_Assert_AssertGuard_true_99630(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} {
+  %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor
+  %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor
+  return %1 : tensor
+}
+func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_100400(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} {
+  %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:zero"> : tensor} : () -> tensor
+  %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor
+  %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits/RowPartitionFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor
+  %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits/RowPartitionFromRowSplits/Const:0) = "> : tensor} : () -> tensor
+  "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> ()
+  %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor
+  %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor
+  return %5 : tensor
+}
+func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_100390(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} {
+  %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor
+  %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor
+  return %1 : tensor
+}
+func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_100760(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape], tf.signature.is_stateful} {
+  %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:monotonic"> : tensor} : () -> tensor
+  %1 = "tf.Const"() {value = dense<"Condition x >= 0 did not hold element-wise:"> : tensor} : () -> tensor
+  %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits/RowPartitionFromRowSplits/sub:0) = "> : tensor} : () -> tensor
+  "tf.Assert"(%arg0, %0, %1, %2, %arg1) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor) -> ()
+  %3 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor
+  %4 = "tf.Identity"(%3) {device = ""} : (tensor) -> tensor
+  return %4 : tensor
+}
+func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_100750(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape]} {
+  %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor
+  %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor
+  return %1 : tensor
+}
+func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_101100(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} {
+  %0 = "tf.Const"() {value = dense<"Arguments to _from_row_partition do not form a valid RaggedTensor"> : tensor} : () -> tensor
+  %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor
+  %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits/strided_slice_1:0) = "> : tensor} : () -> tensor
+  %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor
+  "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> ()
+  %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor
+  %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor
+  return %5 : tensor
+}
+func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_101090(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} {
+  %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor
+  %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor
+  return %1 : tensor
+}
+func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_101470(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} {
+  %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:zero"> : tensor} : () -> tensor
+  %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor
+  %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits_1/RowPartitionFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor
+  %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits_1/RowPartitionFromRowSplits/Const:0) = "> : tensor} : () -> tensor
+  "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> ()
+  %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor
+  %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor
+  return %5 : tensor
+}
+func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_101460(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} {
+  %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor
+  %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor
+  return %1 : tensor
+}
+func 
@WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_101830(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:monotonic"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x >= 0 did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits_1/RowPartitionFromRowSplits/sub:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor) -> () + %3 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %4 = "tf.Identity"(%3) {device = ""} : (tensor) -> tensor + return %4 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_101820(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_assert_equal_1_Assert_AssertGuard_false_102190(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to _from_row_partition do not form a valid RaggedTensor"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits_1/strided_slice:0) = "> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits_1/RaggedNRows/sub:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_assert_equal_1_Assert_AssertGuard_true_102180(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_102540(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to 
from_row_splits do not form a valid RaggedTensor:zero"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_1/RaggedFromRowSplits/RowPartitionFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_1/RaggedFromRowSplits/RowPartitionFromRowSplits/Const:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_102530(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_102900(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:monotonic"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x >= 0 did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_1/RaggedFromRowSplits/RowPartitionFromRowSplits/sub:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor) -> () + %3 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %4 = "tf.Identity"(%3) {device = ""} : (tensor) -> tensor + return %4 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_102890(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_103240(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to _from_row_partition do not form a valid RaggedTensor"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x 
(WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_1/RaggedFromRowSplits/strided_slice_1:0) = "> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_1/RaggedFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_103230(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_103610(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:zero"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_2/RaggedFromRowSplits/RowPartitionFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_2/RaggedFromRowSplits/RowPartitionFromRowSplits/Const:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_103600(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_103970(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:monotonic"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x >= 0 did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x 
(WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_2/RaggedFromRowSplits/RowPartitionFromRowSplits/sub:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor) -> () + %3 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %4 = "tf.Identity"(%3) {device = ""} : (tensor) -> tensor + return %4 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_103960(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_104310(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to _from_row_partition do not form a valid RaggedTensor"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_2/RaggedFromRowSplits/strided_slice_1:0) = "> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_2/RaggedFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_104300(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_AssertGuard_false_105110(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func 
@WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_AssertGuard_true_105100(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_1_AssertGuard_false_106180(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_1_AssertGuard_true_106170(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_2_AssertGuard_false_106670(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_2_AssertGuard_true_106660(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_3_AssertGuard_false_107030(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, 
%1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_3_AssertGuard_true_107020(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_Assert_AssertGuard_false_111870(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_Assert_AssertGuard_true_111860(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_Assert_1_AssertGuard_false_112940(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_Assert_1_AssertGuard_true_112930(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_Assert_2_AssertGuard_false_113430(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor + %1 = "tf.Const"() 
{value = dense<1> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_Assert_2_AssertGuard_true_113420(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_Assert_3_AssertGuard_false_113790(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_Assert_3_AssertGuard_true_113780(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_AssertGuard_false_118470(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_AssertGuard_true_118460(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_1_AssertGuard_false_119540(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> 
tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_1_AssertGuard_true_119530(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_2_AssertGuard_false_120030(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_2_AssertGuard_true_120020(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_3_AssertGuard_false_120390(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_3_AssertGuard_true_120380(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape<>]} { + %0 = 
"tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} + + + +// CHECK: func @whitespace_tokenizer_rank2(%arg0: tensor {tf._user_specified_name = "input"}) -> (tensor, tensor, tensor) attributes {sym_visibility = "private", tf._implements = #tf.func<@"tftext:WhitespaceTokenizer", {}>, tf._input_shapes = [#tf.shape], tf.signature.is_stateful} { +// CHECK: %0:3 = "tfl.custom"(%arg0) {custom_code = "tftext:WhitespaceTokenizer", custom_option = opaque<"tfl", "0x"> : tensor<0xi8>} : (tensor) -> (tensor, tensor, tensor) +// CHECK: return %0#0, %0#1, %0#2 : tensor, tensor, tensor + +func @whitespace_tokenizer_rank0(%arg0: tensor {tf._user_specified_name = "input"}) -> tensor attributes {sym_visibility = "private", tf._input_shapes = [#tf.shape<>], tf._implements = #tf.func<@"tftext:WhitespaceTokenizer", {}>, tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<[0, 1]> : tensor<2xi64>} : () -> tensor<2xi64> + %1 = "tf.Const"() {value = dense<[]> : tensor<0xi64>} : () -> tensor<0xi64> + %2 = "tf.Const"() {value = dense : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + %4 = "tf.Const"() {value = dense<[[0], [1]]> : tensor<2x1xi64>} : () -> tensor<2x1xi64> + %5 = "tf.Const"() {value = dense<-1> : tensor<1xi32>} : () -> tensor<1xi32> + %6 = "tf.Const"() {value = dense<2> : tensor<1xi32>} : () -> tensor<1xi32> + %7 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %8 = "tf.Const"() {value = dense<2> : tensor} : () -> tensor + %9 = "tf.Const"() {value = dense<[]> : tensor<0xi32>} : () -> tensor<0xi32> + %10 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %11 = "tf.Const"() {value = dense<0> : tensor<1xi64>} : () -> tensor<1xi64> + %12 = "tf.Const"() {value = dense<1> : tensor<1xi64>} : () -> tensor<1xi64> + %13 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %14 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %15 = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> + %16 = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + %17 = "tf.If"(%2, %2, %13, %13) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedConcat_assert_equal_1_Assert_AssertGuard_false_3220, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedConcat_assert_equal_1_Assert_AssertGuard_true_3210} : (tensor, tensor, tensor, tensor) -> tensor + %18 = "tf.Identity"(%17) {device = ""} : (tensor) -> tensor + %19 = "tf.Pack"(%arg0) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1x!tf.string> + %20 = "tf.StringLength"(%19) {device = "", unit = "BYTE"} : (tensor<1x!tf.string>) -> tensor<1xi32> + %21 = "tf.ExpandDims"(%20, %7) {device = ""} : (tensor<1xi32>, tensor) -> tensor<1x1xi32> + %22 = "tf.Cast"(%21) {Truncate = false, device = ""} : (tensor<1x1xi32>) -> tensor<1x1xi64> + %23 = "tf.Reshape"(%22, %12) {device = ""} : (tensor<1x1xi64>, tensor<1xi64>) -> tensor<1xi64> + %24 = "tf.Reshape"(%19, %5) {device = ""} : (tensor<1x!tf.string>, tensor<1xi32>) -> tensor<1x!tf.string> + %25:3 = "tf.UnicodeDecodeWithOffsets"(%24) {Tsplits = i64, device = "", errors = "replace", input_encoding = "UTF-8", replace_control_characters = false, replacement_char = 65533 : i64} : (tensor<1x!tf.string>) -> (tensor<2xi64>, tensor, tensor) + %26 = "tf.StridedSlice"(%25#0, %15, %5, %16) {begin_mask = 1 
: i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> + %27 = "tf.AddV2"(%26, %13) {device = ""} : (tensor<1xi64>, tensor) -> tensor<1xi64> + %28 = "tf.StridedSlice"(%25#0, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> + %29 = "tf.Minimum"(%27, %28) {device = ""} : (tensor<1xi64>, tensor<1xi64>) -> tensor<1xi64> + %30:2 = "tf.RaggedRange"(%29, %28, %13) {T = i64, Tsplits = i64, device = ""} : (tensor<1xi64>, tensor<1xi64>, tensor) -> (tensor<2xi64>, tensor) + %31 = "tf.StridedSlice"(%30#0, %5, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %32 = "tf.AddV2"(%31, %12) {device = ""} : (tensor, tensor<1xi64>) -> tensor<1xi64> + %33 = "tf.ConcatV2"(%30#0, %32, %14) {device = ""} : (tensor<2xi64>, tensor<1xi64>, tensor) -> tensor<3xi64> + %34 = "tf.GatherV2"(%25#2, %30#1, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %35 = "tf.ConcatV2"(%34, %23, %14) {device = ""} : (tensor, tensor<1xi64>, tensor) -> tensor + %36:2 = "tf.RaggedGather"(%33, %35, %0) {OUTPUT_RAGGED_RANK = 1 : i64, PARAMS_RAGGED_RANK = 1 : i64, Tindices = i64, Tsplits = i64, Tvalues = i64, device = ""} : (tensor<3xi64>, tensor, tensor<2xi64>) -> (tensor, tensor) + %37:5 = "tf.WhitespaceTokenizeWithOffsets"(%25#1, %25#0) {Tsplits = i64, device = ""} : (tensor, tensor<2xi64>) -> (tensor, tensor, tensor, tensor, tensor) + %38 = "tf.StridedSlice"(%37#1, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %39 = "tf.Equal"(%38, %10) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %40 = "tf.All"(%39, %9) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor + %41 = "tf.If"(%40, %40, %38, %10) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_3980, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_3970} : (tensor, tensor, tensor, tensor) -> tensor + %42 = "tf.Identity"(%41) {device = ""} : (tensor) -> tensor + %43 = "tf.StridedSlice"(%37#1, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %44 = "tf.StridedSlice"(%37#1, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %45 = "tf.Sub"(%43, %44) {device = ""} : (tensor, tensor) -> tensor + %46 = "tf.LessEqual"(%10, %45) {device = ""} : (tensor, tensor) -> tensor + %47 = 
"tf.All"(%46, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor + %48 = "tf.If"(%47, %47, %45) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_4340, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_4330} : (tensor, tensor, tensor) -> tensor + %49 = "tf.Identity"(%48) {device = ""} : (tensor) -> tensor + %50 = "tf.Identity"(%37#1) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor + %51 = "tf.StridedSlice"(%50, %5, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %52 = "tf.Shape"(%37#0) {device = ""} : (tensor) -> tensor<1xi64> + %53 = "tf.StridedSlice"(%52, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %54 = "tf.Equal"(%51, %53) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %55 = "tf.All"(%54, %9) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor + %56 = "tf.If"(%55, %55, %51, %53) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_4680, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_4670} : (tensor, tensor, tensor, tensor) -> tensor + %57 = "tf.Identity"(%56) {device = ""} : (tensor) -> tensor + %58 = "tf.Identity"(%50) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor + %59 = "tf.Shape"(%58) {device = ""} : (tensor) -> tensor<1xi64> + %60 = "tf.StridedSlice"(%59, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %61 = "tf.Sub"(%60, %13) {device = ""} : (tensor, tensor) -> tensor + %62 = "tf.StridedSlice"(%37#4, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %63 = "tf.Equal"(%62, %10) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %64 = "tf.All"(%63, %9) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor + %65 = "tf.If"(%64, %64, %62, %10) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_5050, is_stateless = false, output_shapes = [#tf.shape<>], 
then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_5040} : (tensor, tensor, tensor, tensor) -> tensor + %66 = "tf.Identity"(%65) {device = ""} : (tensor) -> tensor + %67 = "tf.StridedSlice"(%37#4, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %68 = "tf.StridedSlice"(%37#4, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %69 = "tf.Sub"(%67, %68) {device = ""} : (tensor, tensor) -> tensor + %70 = "tf.LessEqual"(%10, %69) {device = ""} : (tensor, tensor) -> tensor + %71 = "tf.All"(%70, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor + %72 = "tf.If"(%71, %71, %69) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_5410, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_5400} : (tensor, tensor, tensor) -> tensor + %73 = "tf.Identity"(%72) {device = ""} : (tensor) -> tensor + %74 = "tf.Identity"(%37#4) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor + %75 = "tf.StridedSlice"(%74, %5, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %76 = "tf.Equal"(%75, %61) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %77 = "tf.All"(%76, %9) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor + %78 = "tf.If"(%77, %77, %75, %61) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_assert_equal_1_Assert_AssertGuard_false_5770, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_assert_equal_1_Assert_AssertGuard_true_5760} : (tensor, tensor, tensor, tensor) -> tensor + %79 = "tf.Identity"(%78) {device = ""} : (tensor) -> tensor + %80 = "tf.Identity"(%74) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor + %81 = "tf.StridedSlice"(%37#4, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %82 = "tf.Equal"(%81, %10) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %83 = "tf.All"(%82, %9) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor + %84 = "tf.If"(%83, %83, %81, %10) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", 
else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_6120, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_6110} : (tensor, tensor, tensor, tensor) -> tensor + %85 = "tf.Identity"(%84) {device = ""} : (tensor) -> tensor + %86 = "tf.StridedSlice"(%37#4, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %87 = "tf.StridedSlice"(%37#4, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %88 = "tf.Sub"(%86, %87) {device = ""} : (tensor, tensor) -> tensor + %89 = "tf.LessEqual"(%10, %88) {device = ""} : (tensor, tensor) -> tensor + %90 = "tf.All"(%89, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor + %91 = "tf.If"(%90, %90, %88) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_6480, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_6470} : (tensor, tensor, tensor) -> tensor + %92 = "tf.Identity"(%91) {device = ""} : (tensor) -> tensor + %93 = "tf.Identity"(%37#4) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor + %94 = "tf.StridedSlice"(%93, %5, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %95 = "tf.Shape"(%37#2) {device = ""} : (tensor) -> tensor<1xi64> + %96 = "tf.StridedSlice"(%95, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %97 = "tf.Equal"(%94, %96) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %98 = "tf.All"(%97, %9) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor + %99 = "tf.If"(%98, %98, %94, %96) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_6820, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_6810} : (tensor, tensor, tensor, tensor) -> tensor + %100 = "tf.Identity"(%99) {device = ""} : (tensor) -> tensor + %101 = "tf.Identity"(%93) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor + %102 = 
"tf.Shape"(%101) {device = ""} : (tensor) -> tensor<1xi64> + %103 = "tf.StridedSlice"(%102, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %104 = "tf.Sub"(%103, %13) {device = ""} : (tensor, tensor) -> tensor + %105 = "tf.Equal"(%104, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %106 = "tf.LogicalOr"(%105, %2) {device = ""} : (tensor, tensor) -> tensor + %107 = "tf.Equal"(%104, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %108 = "tf.LogicalOr"(%106, %107) {device = ""} : (tensor, tensor) -> tensor + %109 = "tf.StridedSlice"(%101, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %110 = "tf.StridedSlice"(%101, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %111 = "tf.Sub"(%109, %110) {device = ""} : (tensor, tensor) -> tensor + %112 = "tf.Shape"(%101) {device = ""} : (tensor) -> tensor<1xi64> + %113 = "tf.StridedSlice"(%112, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %114 = "tf.Sub"(%113, %13) {device = ""} : (tensor, tensor) -> tensor + %115 = "tf.Equal"(%114, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %116 = "tf.ExpandDims"(%101, %7) {device = ""} : (tensor, tensor) -> tensor + %117 = "tf.Shape"(%101) {device = ""} : (tensor) -> tensor<1xi32> + %118 = "tf.StridedSlice"(%117, %15, %15, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %119 = "tf.StridedSlice"(%117, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %120 = "tf.StridedSlice"(%117, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %121 = "tf.StridedSlice"(%37#4, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %122 = "tf.Equal"(%121, %10) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %123 = "tf.All"(%122, %9) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor + %124 = "tf.If"(%123, %123, %121, %10) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_7190, is_stateless = false, output_shapes = [#tf.shape<>], 
then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_7180} : (tensor, tensor, tensor, tensor) -> tensor + %125 = "tf.Identity"(%124) {device = ""} : (tensor) -> tensor + %126 = "tf.StridedSlice"(%37#4, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %127 = "tf.StridedSlice"(%37#4, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %128 = "tf.Sub"(%126, %127) {device = ""} : (tensor, tensor) -> tensor + %129 = "tf.LessEqual"(%10, %128) {device = ""} : (tensor, tensor) -> tensor + %130 = "tf.All"(%129, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor + %131 = "tf.If"(%130, %130, %128) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_7550, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_7540} : (tensor, tensor, tensor) -> tensor + %132 = "tf.Identity"(%131) {device = ""} : (tensor) -> tensor + %133 = "tf.Identity"(%37#4) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor + %134 = "tf.StridedSlice"(%133, %5, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %135 = "tf.Shape"(%37#3) {device = ""} : (tensor) -> tensor<1xi64> + %136 = "tf.StridedSlice"(%135, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %137 = "tf.Equal"(%134, %136) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %138 = "tf.All"(%137, %9) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor + %139 = "tf.If"(%138, %138, %134, %136) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_7890, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_7880} : (tensor, tensor, tensor, tensor) -> tensor + %140 = "tf.Identity"(%139) {device = ""} : (tensor) -> tensor + %141 = "tf.Identity"(%133) {_class = ["loc:@WhitespaceTokenize/WhitespaceTokenize/WhitespaceTokenizeWithOffsets"], device = ""} : (tensor) -> tensor + %142 = "tf.Shape"(%141) {device = ""} : (tensor) -> tensor<1xi64> + %143 = "tf.StridedSlice"(%142, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask 
= 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %144 = "tf.Sub"(%143, %13) {device = ""} : (tensor, tensor) -> tensor + %145 = "tf.Equal"(%144, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %146 = "tf.LogicalOr"(%145, %2) {device = ""} : (tensor, tensor) -> tensor + %147 = "tf.Equal"(%144, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %148 = "tf.LogicalOr"(%146, %147) {device = ""} : (tensor, tensor) -> tensor + %149 = "tf.StridedSlice"(%141, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %150 = "tf.StridedSlice"(%141, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %151 = "tf.Sub"(%149, %150) {device = ""} : (tensor, tensor) -> tensor + %152 = "tf.Shape"(%141) {device = ""} : (tensor) -> tensor<1xi64> + %153 = "tf.StridedSlice"(%152, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %154 = "tf.Sub"(%153, %13) {device = ""} : (tensor, tensor) -> tensor + %155 = "tf.Equal"(%154, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %156 = "tf.ExpandDims"(%141, %7) {device = ""} : (tensor, tensor) -> tensor + %157 = "tf.Shape"(%141) {device = ""} : (tensor) -> tensor<1xi32> + %158 = "tf.StridedSlice"(%157, %15, %15, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %159 = "tf.StridedSlice"(%157, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %160 = "tf.StridedSlice"(%157, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %161 = "tf.StridedSlice"(%141, %5, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %162 = "tf.Range"(%10, %161, %13) {device = ""} : (tensor, tensor, tensor) -> tensor + %163 = "tf.StridedSlice"(%141, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %164 = "tf.StridedSlice"(%141, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %165 = "tf.Sub"(%163, %164) {device = ""} : (tensor, tensor) -> tensor + %166 = "tf.If"(%108, %108, %13, %104) {_lower_using_switch_merge = true, _read_only_resource_inputs = 
[], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_AssertGuard_false_8690, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_AssertGuard_true_8680} : (tensor, tensor, tensor, tensor) -> tensor + %167 = "tf.Identity"(%166) {device = ""} : (tensor) -> tensor + %168 = "tf.Equal"(%104, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %169 = "tf.Select"(%168, %13, %104) {device = ""} : (tensor, tensor, tensor) -> tensor + %170 = "tf.Equal"(%169, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %171 = "tf.LogicalOr"(%170, %2) {device = ""} : (tensor, tensor) -> tensor + %172 = "tf.Equal"(%169, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %173 = "tf.LogicalOr"(%171, %172) {device = ""} : (tensor, tensor) -> tensor + %174 = "tf.Select"(%115, %169, %13) {device = ""} : (tensor, tensor, tensor) -> tensor + %175 = "tf.Pack"(%174, %13) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> + %176 = "tf.StridedSlice"(%175, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %177 = "tf.Cast"(%176) {Truncate = false, device = ""} : (tensor) -> tensor + %178 = "tf.Reshape"(%177, %9) {device = ""} : (tensor, tensor<0xi32>) -> tensor + %179 = "tf.Pack"(%7, %178) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %180 = "tf.Tile"(%116, %179) {device = ""} : (tensor, tensor<2xi32>) -> tensor + %181 = "tf.Mul"(%178, %119) {device = ""} : (tensor, tensor) -> tensor + %182 = "tf.Pack"(%181) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %183 = "tf.ConcatV2"(%118, %182, %120, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %184 = "tf.Reshape"(%180, %183) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %185 = "tf.Shape"(%184) {device = ""} : (tensor) -> tensor<1xi64> + %186 = "tf.StridedSlice"(%185, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %187 = "tf.Pack"(%176) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %188 = "tf.StridedSlice"(%184, %187, %11, %12) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %189 = "tf.Sub"(%186, %176) {device = ""} : (tensor, tensor) -> tensor + %190 = "tf.Pack"(%189) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %191 = "tf.StridedSlice"(%184, %11, %190, %12) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %192:2 = "tf.RaggedRange"(%191, %188, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %193 = "tf.Select"(%2, %169, %13) {device = ""} : (tensor, tensor, tensor) -> tensor + %194 = "tf.Pack"(%193, %13) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> + %195 = "tf.StridedSlice"(%194, %16, %6, %16) {begin_mask = 0 : i64, device = "", 
ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %196 = "tf.Cast"(%195) {Truncate = false, device = ""} : (tensor) -> tensor + %197 = "tf.Reshape"(%196, %9) {device = ""} : (tensor, tensor<0xi32>) -> tensor + %198 = "tf.Pack"(%7, %197) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %199 = "tf.Tile"(%4, %198) {device = ""} : (tensor<2x1xi64>, tensor<2xi32>) -> tensor<2x?xi64> + %200 = "tf.Mul"(%197, %8) {device = ""} : (tensor, tensor) -> tensor + %201 = "tf.Pack"(%200) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %202 = "tf.ConcatV2"(%9, %201, %9, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %203 = "tf.Reshape"(%199, %202) {device = ""} : (tensor<2x?xi64>, tensor<1xi32>) -> tensor + %204 = "tf.Shape"(%203) {device = ""} : (tensor) -> tensor<1xi64> + %205 = "tf.StridedSlice"(%204, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %206 = "tf.Pack"(%195) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %207 = "tf.StridedSlice"(%203, %206, %11, %12) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %208 = "tf.Sub"(%205, %195) {device = ""} : (tensor, tensor) -> tensor + %209 = "tf.Pack"(%208) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %210 = "tf.StridedSlice"(%203, %11, %209, %12) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %211:2 = "tf.RaggedRange"(%210, %207, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %212 = "tf.StridedSlice"(%194, %15, %16, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> + %213 = "tf.StridedSlice"(%194, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %214 = "tf.Mul"(%213, %12) {device = ""} : (tensor, tensor<1xi64>) -> tensor<1xi64> + %215 = "tf.Tile"(%214, %212) {device = ""} : (tensor<1xi64>, tensor<1xi64>) -> tensor + %216 = "tf.Cumsum"(%215, %14) {device = "", exclusive = false, reverse = false} : (tensor, tensor) -> tensor + %217 = "tf.ConcatV2"(%11, %216, %3) {device = ""} : (tensor<1xi64>, tensor, tensor) -> tensor + %218 = "tf.StridedSlice"(%217, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %219 = "tf.ExpandDims"(%218, %7) {device = ""} : (tensor, tensor) -> tensor + %220 = "tf.Shape"(%218) {device = ""} : (tensor) -> tensor<1xi32> + %221 = "tf.StridedSlice"(%220, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 
: i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %222 = "tf.Pack"(%221) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %223 = "tf.StridedSlice"(%217, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %224 = "tf.ExpandDims"(%223, %7) {device = ""} : (tensor, tensor) -> tensor + %225 = "tf.Shape"(%223) {device = ""} : (tensor) -> tensor<1xi32> + %226 = "tf.StridedSlice"(%225, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %227 = "tf.Pack"(%226) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %228 = "tf.Equal"(%104, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %229 = "tf.Select"(%228, %169, %13) {device = ""} : (tensor, tensor, tensor) -> tensor + %230 = "tf.Cast"(%229) {Truncate = false, device = ""} : (tensor) -> tensor + %231 = "tf.Reshape"(%230, %9) {device = ""} : (tensor, tensor<0xi32>) -> tensor + %232 = "tf.Pack"(%7, %231) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %233 = "tf.Mul"(%231, %8) {device = ""} : (tensor, tensor) -> tensor + %234 = "tf.Pack"(%233) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %235 = "tf.ConcatV2"(%9, %234, %9, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %236 = "tf.Pack"(%229) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %237 = "tf.Pack"(%10, %104) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> + %238 = "tf.ExpandDims"(%237, %7) {device = ""} : (tensor<2xi64>, tensor) -> tensor<2x1xi64> + %239 = "tf.Tile"(%238, %232) {device = ""} : (tensor<2x1xi64>, tensor<2xi32>) -> tensor<2x?xi64> + %240 = "tf.Reshape"(%239, %235) {device = ""} : (tensor<2x?xi64>, tensor<1xi32>) -> tensor + %241 = "tf.Shape"(%240) {device = ""} : (tensor) -> tensor<1xi64> + %242 = "tf.StridedSlice"(%241, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %243 = "tf.Sub"(%242, %229) {device = ""} : (tensor, tensor) -> tensor + %244 = "tf.Pack"(%243) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %245 = "tf.StridedSlice"(%240, %11, %244, %12) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %246 = "tf.StridedSlice"(%240, %236, %11, %12) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %247:2 = "tf.RaggedRange"(%245, %246, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %248 = "tf.GatherV2"(%111, %247#1, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %249 = "tf.Cast"(%248) {Truncate = false, device = ""} : (tensor) -> tensor + %250 = "tf.BroadcastTo"(%249, %222) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %251 = "tf.Max"(%250, %15) {device = "", keep_dims = false} : (tensor, 
tensor<1xi32>) -> tensor + %252 = "tf.Maximum"(%14, %251) {device = ""} : (tensor, tensor) -> tensor + %253 = "tf.Range"(%14, %252, %7) {device = ""} : (tensor, tensor, tensor) -> tensor + %254 = "tf.Pack"(%7, %252) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %255 = "tf.Tile"(%219, %254) {device = ""} : (tensor, tensor<2xi32>) -> tensor + %256 = "tf.Shape"(%255) {device = ""} : (tensor) -> tensor<2xi32> + %257 = "tf.StridedSlice"(%256, %15, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> + %258 = "tf.Prod"(%257, %15) {device = "", keep_dims = false} : (tensor<2xi32>, tensor<1xi32>) -> tensor + %259 = "tf.Pack"(%258) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %260 = "tf.Shape"(%255) {device = ""} : (tensor) -> tensor<2xi32> + %261 = "tf.StridedSlice"(%260, %15, %15, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %262 = "tf.Shape"(%255) {device = ""} : (tensor) -> tensor<2xi32> + %263 = "tf.StridedSlice"(%262, %6, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %264 = "tf.ConcatV2"(%261, %259, %263, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %265 = "tf.Reshape"(%255, %264) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %266 = "tf.ExpandDims"(%250, %3) {device = ""} : (tensor, tensor) -> tensor + %267 = "tf.Less"(%253, %266) {device = ""} : (tensor, tensor) -> tensor + %268 = "tf.Reshape"(%267, %5) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %269 = "tf.Where"(%268) {device = ""} : (tensor) -> tensor + %270 = "tf.Squeeze"(%269) {device = "", squeeze_dims = [1]} : (tensor) -> tensor + %271 = "tf.GatherV2"(%265, %270, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %272 = "tf.Cast"(%248) {Truncate = false, device = ""} : (tensor) -> tensor + %273 = "tf.BroadcastTo"(%272, %227) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %274 = "tf.Max"(%273, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor + %275 = "tf.Maximum"(%14, %274) {device = ""} : (tensor, tensor) -> tensor + %276 = "tf.Range"(%14, %275, %7) {device = ""} : (tensor, tensor, tensor) -> tensor + %277 = "tf.Pack"(%7, %275) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %278 = "tf.Tile"(%224, %277) {device = ""} : (tensor, tensor<2xi32>) -> tensor + %279 = "tf.Shape"(%278) {device = ""} : (tensor) -> tensor<2xi32> + %280 = "tf.StridedSlice"(%279, %15, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> + %281 = "tf.Prod"(%280, %15) {device = "", keep_dims = false} : (tensor<2xi32>, tensor<1xi32>) -> tensor + %282 = "tf.Pack"(%281) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %283 = "tf.Shape"(%278) {device = ""} : (tensor) -> tensor<2xi32> + %284 = "tf.StridedSlice"(%283, %15, %15, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask 
= 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %285 = "tf.Shape"(%278) {device = ""} : (tensor) -> tensor<2xi32> + %286 = "tf.StridedSlice"(%285, %6, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %287 = "tf.ConcatV2"(%284, %282, %286, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %288 = "tf.Reshape"(%278, %287) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %289 = "tf.ExpandDims"(%273, %3) {device = ""} : (tensor, tensor) -> tensor + %290 = "tf.Less"(%276, %289) {device = ""} : (tensor, tensor) -> tensor + %291 = "tf.Reshape"(%290, %5) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %292 = "tf.Where"(%291) {device = ""} : (tensor) -> tensor + %293 = "tf.Squeeze"(%292) {device = "", squeeze_dims = [1]} : (tensor) -> tensor + %294 = "tf.GatherV2"(%288, %293, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %295:2 = "tf.RaggedRange"(%271, %294, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %296 = "tf.If"(%173, %173, %169, %13) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_1_AssertGuard_false_9760, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_1_AssertGuard_true_9750} : (tensor, tensor, tensor, tensor) -> tensor + %297 = "tf.Identity"(%296) {device = ""} : (tensor) -> tensor + %298 = "tf.Select"(%2, %169, %13) {device = ""} : (tensor, tensor, tensor) -> tensor + %299 = "tf.Pack"(%298) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %300 = "tf.ConcatV2"(%1, %299, %12, %14) {device = ""} : (tensor<0xi64>, tensor<1xi64>, tensor<1xi64>, tensor) -> tensor<2xi64> + %301 = "tf.StridedSlice"(%300, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %302 = "tf.Equal"(%301, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %303 = "tf.StridedSlice"(%300, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %304 = "tf.StridedSlice"(%300, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %305 = "tf.Equal"(%304, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %306 = "tf.If"(%305, %305, %304, %248) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_2_AssertGuard_false_10250, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_2_AssertGuard_true_10240} : (tensor, tensor, tensor, tensor) -> tensor + %307 = "tf.Identity"(%306) {device = ""} : (tensor) -> tensor + 
%308 = "tf.If"(%302, %302, %248, %303) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_3_AssertGuard_false_10610, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_3_AssertGuard_true_10600} : (tensor, tensor, tensor, tensor) -> tensor + %309 = "tf.If"(%148, %148, %13, %144) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_Assert_AssertGuard_false_15310, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_Assert_AssertGuard_true_15300} : (tensor, tensor, tensor, tensor) -> tensor + %310 = "tf.Identity"(%309) {device = ""} : (tensor) -> tensor + %311 = "tf.Equal"(%144, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %312 = "tf.Select"(%311, %13, %144) {device = ""} : (tensor, tensor, tensor) -> tensor + %313 = "tf.Equal"(%312, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %314 = "tf.LogicalOr"(%313, %2) {device = ""} : (tensor, tensor) -> tensor + %315 = "tf.Equal"(%312, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %316 = "tf.LogicalOr"(%314, %315) {device = ""} : (tensor, tensor) -> tensor + %317 = "tf.Select"(%155, %312, %13) {device = ""} : (tensor, tensor, tensor) -> tensor + %318 = "tf.Pack"(%317, %13) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> + %319 = "tf.StridedSlice"(%318, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %320 = "tf.Cast"(%319) {Truncate = false, device = ""} : (tensor) -> tensor + %321 = "tf.Reshape"(%320, %9) {device = ""} : (tensor, tensor<0xi32>) -> tensor + %322 = "tf.Pack"(%7, %321) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %323 = "tf.Tile"(%156, %322) {device = ""} : (tensor, tensor<2xi32>) -> tensor + %324 = "tf.Mul"(%321, %159) {device = ""} : (tensor, tensor) -> tensor + %325 = "tf.Pack"(%324) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %326 = "tf.ConcatV2"(%158, %325, %160, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %327 = "tf.Reshape"(%323, %326) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %328 = "tf.Shape"(%327) {device = ""} : (tensor) -> tensor<1xi64> + %329 = "tf.StridedSlice"(%328, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %330 = "tf.Pack"(%319) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %331 = "tf.StridedSlice"(%327, %330, %11, %12) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %332 = "tf.Sub"(%329, %319) {device = ""} : (tensor, tensor) -> tensor + %333 = "tf.Pack"(%332) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %334 = "tf.StridedSlice"(%327, %11, %333, %12) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, 
shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %335:2 = "tf.RaggedRange"(%334, %331, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %336 = "tf.GatherV2"(%162, %335#1, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %337 = "tf.StridedSlice"(%318, %15, %16, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> + %338 = "tf.StridedSlice"(%318, %15, %16, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> + %339 = "tf.StridedSlice"(%318, %6, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi64> + %340 = "tf.ConcatV2"(%338, %339, %14) {device = ""} : (tensor<1xi64>, tensor<0xi64>, tensor) -> tensor<1xi64> + %341 = "tf.StridedSlice"(%318, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %342 = "tf.Mul"(%165, %341) {device = ""} : (tensor, tensor) -> tensor + %343 = "tf.Tile"(%342, %337) {device = ""} : (tensor, tensor<1xi64>) -> tensor + %344 = "tf.Cumsum"(%343, %14) {device = "", exclusive = false, reverse = false} : (tensor, tensor) -> tensor + %345 = "tf.ConcatV2"(%11, %344, %3) {device = ""} : (tensor<1xi64>, tensor, tensor) -> tensor + %346 = "tf.Shape"(%345) {device = ""} : (tensor) -> tensor<1xi64> + %347 = "tf.StridedSlice"(%346, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %348 = "tf.Sub"(%347, %13) {device = ""} : (tensor, tensor) -> tensor + %349 = "tf.Equal"(%348, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %350 = "tf.LogicalOr"(%349, %2) {device = ""} : (tensor, tensor) -> tensor + %351 = "tf.Equal"(%348, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %352 = "tf.LogicalOr"(%350, %351) {device = ""} : (tensor, tensor) -> tensor + %353 = "tf.StridedSlice"(%345, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %354 = "tf.StridedSlice"(%345, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %355 = "tf.Sub"(%353, %354) {device = ""} : (tensor, tensor) -> tensor + %356 = "tf.Shape"(%345) {device = ""} : (tensor) -> tensor<1xi64> + %357 = "tf.StridedSlice"(%356, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %358 = "tf.Sub"(%357, %13) {device = ""} : 
(tensor, tensor) -> tensor + %359 = "tf.Equal"(%358, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %360 = "tf.ExpandDims"(%345, %7) {device = ""} : (tensor, tensor) -> tensor + %361 = "tf.Shape"(%345) {device = ""} : (tensor) -> tensor<1xi32> + %362 = "tf.StridedSlice"(%361, %15, %15, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %363 = "tf.StridedSlice"(%361, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %364 = "tf.StridedSlice"(%361, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %365 = "tf.Select"(%2, %312, %13) {device = ""} : (tensor, tensor, tensor) -> tensor + %366 = "tf.Pack"(%365, %13) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> + %367 = "tf.StridedSlice"(%366, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %368 = "tf.Cast"(%367) {Truncate = false, device = ""} : (tensor) -> tensor + %369 = "tf.Reshape"(%368, %9) {device = ""} : (tensor, tensor<0xi32>) -> tensor + %370 = "tf.Pack"(%7, %369) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %371 = "tf.Tile"(%4, %370) {device = ""} : (tensor<2x1xi64>, tensor<2xi32>) -> tensor<2x?xi64> + %372 = "tf.Mul"(%369, %8) {device = ""} : (tensor, tensor) -> tensor + %373 = "tf.Pack"(%372) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %374 = "tf.ConcatV2"(%9, %373, %9, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %375 = "tf.Reshape"(%371, %374) {device = ""} : (tensor<2x?xi64>, tensor<1xi32>) -> tensor + %376 = "tf.Shape"(%375) {device = ""} : (tensor) -> tensor<1xi64> + %377 = "tf.StridedSlice"(%376, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %378 = "tf.Pack"(%367) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %379 = "tf.StridedSlice"(%375, %378, %11, %12) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %380 = "tf.Sub"(%377, %367) {device = ""} : (tensor, tensor) -> tensor + %381 = "tf.Pack"(%380) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %382 = "tf.StridedSlice"(%375, %11, %381, %12) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %383:2 = "tf.RaggedRange"(%382, %379, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %384 = "tf.GatherV2"(%11, %383#1, %14) {batch_dims = 0 : i64, device = ""} : (tensor<1xi64>, tensor, tensor) -> tensor + %385 = 
"tf.GatherV2"(%12, %384, %14) {batch_dims = 0 : i64, device = ""} : (tensor<1xi64>, tensor, tensor) -> tensor + %386 = "tf.StridedSlice"(%366, %15, %16, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> + %387 = "tf.StridedSlice"(%366, %15, %16, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> + %388 = "tf.StridedSlice"(%366, %6, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi64> + %389 = "tf.ConcatV2"(%387, %388, %14) {device = ""} : (tensor<1xi64>, tensor<0xi64>, tensor) -> tensor<1xi64> + %390 = "tf.Tile"(%385, %389) {device = ""} : (tensor, tensor<1xi64>) -> tensor + %391 = "tf.StridedSlice"(%366, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %392 = "tf.Mul"(%391, %12) {device = ""} : (tensor, tensor<1xi64>) -> tensor<1xi64> + %393 = "tf.Tile"(%392, %386) {device = ""} : (tensor<1xi64>, tensor<1xi64>) -> tensor + %394 = "tf.Cumsum"(%393, %14) {device = "", exclusive = false, reverse = false} : (tensor, tensor) -> tensor + %395 = "tf.ConcatV2"(%11, %394, %3) {device = ""} : (tensor<1xi64>, tensor, tensor) -> tensor + %396 = "tf.StridedSlice"(%395, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %397 = "tf.ExpandDims"(%396, %7) {device = ""} : (tensor, tensor) -> tensor + %398 = "tf.Shape"(%396) {device = ""} : (tensor) -> tensor<1xi32> + %399 = "tf.StridedSlice"(%398, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %400 = "tf.Pack"(%399) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %401 = "tf.StridedSlice"(%395, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %402 = "tf.ExpandDims"(%401, %7) {device = ""} : (tensor, tensor) -> tensor + %403 = "tf.Shape"(%401) {device = ""} : (tensor) -> tensor<1xi32> + %404 = "tf.StridedSlice"(%403, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %405 = "tf.Pack"(%404) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %406 = "tf.Equal"(%144, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %407 = "tf.Select"(%406, %312, %13) {device = ""} : (tensor, tensor, tensor) -> tensor + %408 = "tf.Cast"(%407) {Truncate = false, device = ""} : (tensor) -> tensor + %409 = "tf.Reshape"(%408, %9) {device = ""} : (tensor, tensor<0xi32>) -> tensor + %410 = 
"tf.Pack"(%7, %409) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %411 = "tf.Mul"(%409, %8) {device = ""} : (tensor, tensor) -> tensor + %412 = "tf.Pack"(%411) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %413 = "tf.ConcatV2"(%9, %412, %9, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %414 = "tf.Pack"(%407) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %415 = "tf.Pack"(%10, %144) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> + %416 = "tf.ExpandDims"(%415, %7) {device = ""} : (tensor<2xi64>, tensor) -> tensor<2x1xi64> + %417 = "tf.Tile"(%416, %410) {device = ""} : (tensor<2x1xi64>, tensor<2xi32>) -> tensor<2x?xi64> + %418 = "tf.Reshape"(%417, %413) {device = ""} : (tensor<2x?xi64>, tensor<1xi32>) -> tensor + %419 = "tf.Shape"(%418) {device = ""} : (tensor) -> tensor<1xi64> + %420 = "tf.StridedSlice"(%419, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %421 = "tf.Sub"(%420, %407) {device = ""} : (tensor, tensor) -> tensor + %422 = "tf.Pack"(%421) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %423 = "tf.StridedSlice"(%418, %11, %422, %12) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %424 = "tf.StridedSlice"(%418, %414, %11, %12) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %425:2 = "tf.RaggedRange"(%423, %424, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %426 = "tf.GatherV2"(%151, %425#1, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %427 = "tf.Cast"(%426) {Truncate = false, device = ""} : (tensor) -> tensor + %428 = "tf.BroadcastTo"(%427, %400) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %429 = "tf.Max"(%428, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor + %430 = "tf.Maximum"(%14, %429) {device = ""} : (tensor, tensor) -> tensor + %431 = "tf.Range"(%14, %430, %7) {device = ""} : (tensor, tensor, tensor) -> tensor + %432 = "tf.Pack"(%7, %430) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %433 = "tf.Tile"(%397, %432) {device = ""} : (tensor, tensor<2xi32>) -> tensor + %434 = "tf.Shape"(%433) {device = ""} : (tensor) -> tensor<2xi32> + %435 = "tf.StridedSlice"(%434, %15, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> + %436 = "tf.Prod"(%435, %15) {device = "", keep_dims = false} : (tensor<2xi32>, tensor<1xi32>) -> tensor + %437 = "tf.Pack"(%436) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %438 = "tf.Shape"(%433) {device = ""} : (tensor) -> tensor<2xi32> + %439 = "tf.StridedSlice"(%438, %15, %15, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %440 = "tf.Shape"(%433) {device = ""} : 
(tensor) -> tensor<2xi32> + %441 = "tf.StridedSlice"(%440, %6, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %442 = "tf.ConcatV2"(%439, %437, %441, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %443 = "tf.Reshape"(%433, %442) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %444 = "tf.ExpandDims"(%428, %3) {device = ""} : (tensor, tensor) -> tensor + %445 = "tf.Less"(%431, %444) {device = ""} : (tensor, tensor) -> tensor + %446 = "tf.Reshape"(%445, %5) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %447 = "tf.Where"(%446) {device = ""} : (tensor) -> tensor + %448 = "tf.Squeeze"(%447) {device = "", squeeze_dims = [1]} : (tensor) -> tensor + %449 = "tf.GatherV2"(%443, %448, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %450 = "tf.Cast"(%426) {Truncate = false, device = ""} : (tensor) -> tensor + %451 = "tf.BroadcastTo"(%450, %405) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %452 = "tf.Max"(%451, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor + %453 = "tf.Maximum"(%14, %452) {device = ""} : (tensor, tensor) -> tensor + %454 = "tf.Range"(%14, %453, %7) {device = ""} : (tensor, tensor, tensor) -> tensor + %455 = "tf.Pack"(%7, %453) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %456 = "tf.Tile"(%402, %455) {device = ""} : (tensor, tensor<2xi32>) -> tensor + %457 = "tf.Shape"(%456) {device = ""} : (tensor) -> tensor<2xi32> + %458 = "tf.StridedSlice"(%457, %15, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> + %459 = "tf.Prod"(%458, %15) {device = "", keep_dims = false} : (tensor<2xi32>, tensor<1xi32>) -> tensor + %460 = "tf.Pack"(%459) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %461 = "tf.Shape"(%456) {device = ""} : (tensor) -> tensor<2xi32> + %462 = "tf.StridedSlice"(%461, %15, %15, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %463 = "tf.Shape"(%456) {device = ""} : (tensor) -> tensor<2xi32> + %464 = "tf.StridedSlice"(%463, %6, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %465 = "tf.ConcatV2"(%462, %460, %464, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %466 = "tf.Reshape"(%456, %465) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %467 = "tf.ExpandDims"(%451, %3) {device = ""} : (tensor, tensor) -> tensor + %468 = "tf.Less"(%454, %467) {device = ""} : (tensor, tensor) -> tensor + %469 = "tf.Reshape"(%468, %5) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %470 = "tf.Where"(%469) {device = ""} : (tensor) -> tensor + %471 = "tf.Squeeze"(%470) {device = "", squeeze_dims = [1]} : (tensor) -> tensor + %472 = "tf.GatherV2"(%466, %471, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %473:2 = "tf.RaggedRange"(%449, %472, %13) {T = i64, Tsplits = i64, 
device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %474 = "tf.GatherV2"(%390, %473#1, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %475 = "tf.If"(%316, %316, %312, %13) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_Assert_1_AssertGuard_false_16380, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_Assert_1_AssertGuard_true_16370} : (tensor, tensor, tensor, tensor) -> tensor + %476 = "tf.Identity"(%475) {device = ""} : (tensor) -> tensor + %477 = "tf.Select"(%2, %312, %13) {device = ""} : (tensor, tensor, tensor) -> tensor + %478 = "tf.Pack"(%477) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %479 = "tf.ConcatV2"(%1, %478, %12, %14) {device = ""} : (tensor<0xi64>, tensor<1xi64>, tensor<1xi64>, tensor) -> tensor<2xi64> + %480 = "tf.StridedSlice"(%479, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %481 = "tf.Equal"(%480, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %482 = "tf.StridedSlice"(%479, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %483 = "tf.StridedSlice"(%479, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %484 = "tf.Equal"(%483, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %485 = "tf.If"(%484, %484, %483, %426) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_Assert_2_AssertGuard_false_16870, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_Assert_2_AssertGuard_true_16860} : (tensor, tensor, tensor, tensor) -> tensor + %486 = "tf.Identity"(%485) {device = ""} : (tensor) -> tensor + %487 = "tf.If"(%481, %481, %426, %482) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_Assert_3_AssertGuard_false_17230, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_Assert_3_AssertGuard_true_17220} : (tensor, tensor, tensor, tensor) -> tensor + %488 = "tf.Identity"(%487) {device = ""} : (tensor) -> tensor + %489 = "tf.If"(%352, %352, %13, %348) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_AssertGuard_false_21910, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_AssertGuard_true_21900} : (tensor, tensor, tensor, tensor) -> tensor + %490 = "tf.Identity"(%489) {device = ""} : (tensor) -> tensor + %491 = "tf.Equal"(%348, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %492 = "tf.Select"(%491, %13, %348) {device = ""} : (tensor, tensor, tensor) -> tensor + %493 = "tf.Equal"(%492, %13) {device = "", 
incompatible_shape_error = true} : (tensor, tensor) -> tensor + %494 = "tf.LogicalOr"(%493, %2) {device = ""} : (tensor, tensor) -> tensor + %495 = "tf.Equal"(%492, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %496 = "tf.LogicalOr"(%494, %495) {device = ""} : (tensor, tensor) -> tensor + %497 = "tf.Select"(%359, %492, %13) {device = ""} : (tensor, tensor, tensor) -> tensor + %498 = "tf.Pack"(%497, %13) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> + %499 = "tf.StridedSlice"(%498, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %500 = "tf.Cast"(%499) {Truncate = false, device = ""} : (tensor) -> tensor + %501 = "tf.Reshape"(%500, %9) {device = ""} : (tensor, tensor<0xi32>) -> tensor + %502 = "tf.Pack"(%7, %501) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %503 = "tf.Tile"(%360, %502) {device = ""} : (tensor, tensor<2xi32>) -> tensor + %504 = "tf.Mul"(%501, %363) {device = ""} : (tensor, tensor) -> tensor + %505 = "tf.Pack"(%504) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %506 = "tf.ConcatV2"(%362, %505, %364, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %507 = "tf.Reshape"(%503, %506) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %508 = "tf.Shape"(%507) {device = ""} : (tensor) -> tensor<1xi64> + %509 = "tf.StridedSlice"(%508, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %510 = "tf.Pack"(%499) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %511 = "tf.StridedSlice"(%507, %510, %11, %12) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %512 = "tf.Sub"(%509, %499) {device = ""} : (tensor, tensor) -> tensor + %513 = "tf.Pack"(%512) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %514 = "tf.StridedSlice"(%507, %11, %513, %12) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %515:2 = "tf.RaggedRange"(%514, %511, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %516 = "tf.Select"(%2, %492, %13) {device = ""} : (tensor, tensor, tensor) -> tensor + %517 = "tf.Pack"(%516, %13) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> + %518 = "tf.StridedSlice"(%517, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %519 = "tf.Cast"(%518) {Truncate = false, device = ""} : (tensor) -> tensor + %520 = "tf.Reshape"(%519, %9) {device = ""} : (tensor, tensor<0xi32>) -> tensor + %521 = "tf.Pack"(%7, %520) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %522 = "tf.Tile"(%4, %521) {device = ""} : (tensor<2x1xi64>, tensor<2xi32>) -> tensor<2x?xi64> + %523 = "tf.Mul"(%520, %8) {device = ""} : (tensor, tensor) -> tensor + %524 = "tf.Pack"(%523) {axis 
= 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %525 = "tf.ConcatV2"(%9, %524, %9, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %526 = "tf.Reshape"(%522, %525) {device = ""} : (tensor<2x?xi64>, tensor<1xi32>) -> tensor + %527 = "tf.Shape"(%526) {device = ""} : (tensor) -> tensor<1xi64> + %528 = "tf.StridedSlice"(%527, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %529 = "tf.Pack"(%518) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %530 = "tf.StridedSlice"(%526, %529, %11, %12) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %531 = "tf.Sub"(%528, %518) {device = ""} : (tensor, tensor) -> tensor + %532 = "tf.Pack"(%531) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %533 = "tf.StridedSlice"(%526, %11, %532, %12) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %534:2 = "tf.RaggedRange"(%533, %530, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %535 = "tf.StridedSlice"(%517, %15, %16, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> + %536 = "tf.StridedSlice"(%517, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %537 = "tf.Mul"(%536, %12) {device = ""} : (tensor, tensor<1xi64>) -> tensor<1xi64> + %538 = "tf.Tile"(%537, %535) {device = ""} : (tensor<1xi64>, tensor<1xi64>) -> tensor + %539 = "tf.Cumsum"(%538, %14) {device = "", exclusive = false, reverse = false} : (tensor, tensor) -> tensor + %540 = "tf.ConcatV2"(%11, %539, %3) {device = ""} : (tensor<1xi64>, tensor, tensor) -> tensor + %541 = "tf.StridedSlice"(%540, %15, %5, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %542 = "tf.ExpandDims"(%541, %7) {device = ""} : (tensor, tensor) -> tensor + %543 = "tf.Shape"(%541) {device = ""} : (tensor) -> tensor<1xi32> + %544 = "tf.StridedSlice"(%543, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %545 = "tf.Pack"(%544) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %546 = "tf.StridedSlice"(%540, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %547 = "tf.ExpandDims"(%546, %7) {device = ""} : (tensor, tensor) -> tensor + %548 = "tf.Shape"(%546) {device = ""} : (tensor) -> tensor<1xi32> + %549 = "tf.StridedSlice"(%548, %15, %16, %16) {begin_mask = 0 : 
i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %550 = "tf.Pack"(%549) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %551 = "tf.Equal"(%348, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %552 = "tf.Select"(%551, %492, %13) {device = ""} : (tensor, tensor, tensor) -> tensor + %553 = "tf.Cast"(%552) {Truncate = false, device = ""} : (tensor) -> tensor + %554 = "tf.Reshape"(%553, %9) {device = ""} : (tensor, tensor<0xi32>) -> tensor + %555 = "tf.Pack"(%7, %554) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %556 = "tf.Mul"(%554, %8) {device = ""} : (tensor, tensor) -> tensor + %557 = "tf.Pack"(%556) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %558 = "tf.ConcatV2"(%9, %557, %9, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %559 = "tf.Pack"(%552) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %560 = "tf.Pack"(%10, %348) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi64> + %561 = "tf.ExpandDims"(%560, %7) {device = ""} : (tensor<2xi64>, tensor) -> tensor<2x1xi64> + %562 = "tf.Tile"(%561, %555) {device = ""} : (tensor<2x1xi64>, tensor<2xi32>) -> tensor<2x?xi64> + %563 = "tf.Reshape"(%562, %558) {device = ""} : (tensor<2x?xi64>, tensor<1xi32>) -> tensor + %564 = "tf.Shape"(%563) {device = ""} : (tensor) -> tensor<1xi64> + %565 = "tf.StridedSlice"(%564, %15, %16, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %566 = "tf.Sub"(%565, %552) {device = ""} : (tensor, tensor) -> tensor + %567 = "tf.Pack"(%566) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %568 = "tf.StridedSlice"(%563, %11, %567, %12) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %569 = "tf.StridedSlice"(%563, %559, %11, %12) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor + %570:2 = "tf.RaggedRange"(%568, %569, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %571 = "tf.GatherV2"(%355, %570#1, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %572 = "tf.Cast"(%571) {Truncate = false, device = ""} : (tensor) -> tensor + %573 = "tf.BroadcastTo"(%572, %545) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %574 = "tf.Max"(%573, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor + %575 = "tf.Maximum"(%14, %574) {device = ""} : (tensor, tensor) -> tensor + %576 = "tf.Range"(%14, %575, %7) {device = ""} : (tensor, tensor, tensor) -> tensor + %577 = "tf.Pack"(%7, %575) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %578 = "tf.Tile"(%542, %577) {device = ""} : (tensor, tensor<2xi32>) -> tensor + %579 = "tf.Shape"(%578) {device = ""} : (tensor) -> tensor<2xi32> + %580 = "tf.StridedSlice"(%579, %15, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : 
(tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> + %581 = "tf.Prod"(%580, %15) {device = "", keep_dims = false} : (tensor<2xi32>, tensor<1xi32>) -> tensor + %582 = "tf.Pack"(%581) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %583 = "tf.Shape"(%578) {device = ""} : (tensor) -> tensor<2xi32> + %584 = "tf.StridedSlice"(%583, %15, %15, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %585 = "tf.Shape"(%578) {device = ""} : (tensor) -> tensor<2xi32> + %586 = "tf.StridedSlice"(%585, %6, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %587 = "tf.ConcatV2"(%584, %582, %586, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %588 = "tf.Reshape"(%578, %587) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %589 = "tf.ExpandDims"(%573, %3) {device = ""} : (tensor, tensor) -> tensor + %590 = "tf.Less"(%576, %589) {device = ""} : (tensor, tensor) -> tensor + %591 = "tf.Reshape"(%590, %5) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %592 = "tf.Where"(%591) {device = ""} : (tensor) -> tensor + %593 = "tf.Squeeze"(%592) {device = "", squeeze_dims = [1]} : (tensor) -> tensor + %594 = "tf.GatherV2"(%588, %593, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %595 = "tf.Cast"(%571) {Truncate = false, device = ""} : (tensor) -> tensor + %596 = "tf.BroadcastTo"(%595, %550) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %597 = "tf.Max"(%596, %15) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor + %598 = "tf.Maximum"(%14, %597) {device = ""} : (tensor, tensor) -> tensor + %599 = "tf.Range"(%14, %598, %7) {device = ""} : (tensor, tensor, tensor) -> tensor + %600 = "tf.Pack"(%7, %598) {axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xi32> + %601 = "tf.Tile"(%547, %600) {device = ""} : (tensor, tensor<2xi32>) -> tensor + %602 = "tf.Shape"(%601) {device = ""} : (tensor) -> tensor<2xi32> + %603 = "tf.StridedSlice"(%602, %15, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> + %604 = "tf.Prod"(%603, %15) {device = "", keep_dims = false} : (tensor<2xi32>, tensor<1xi32>) -> tensor + %605 = "tf.Pack"(%604) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %606 = "tf.Shape"(%601) {device = ""} : (tensor) -> tensor<2xi32> + %607 = "tf.StridedSlice"(%606, %15, %15, %16) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %608 = "tf.Shape"(%601) {device = ""} : (tensor) -> tensor<2xi32> + %609 = "tf.StridedSlice"(%608, %6, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %610 = "tf.ConcatV2"(%607, %605, %609, %14) {device = ""} : (tensor<0xi32>, tensor<1xi32>, tensor<0xi32>, tensor) -> tensor<1xi32> + %611 = 
"tf.Reshape"(%601, %610) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %612 = "tf.ExpandDims"(%596, %3) {device = ""} : (tensor, tensor) -> tensor + %613 = "tf.Less"(%599, %612) {device = ""} : (tensor, tensor) -> tensor + %614 = "tf.Reshape"(%613, %5) {device = ""} : (tensor, tensor<1xi32>) -> tensor + %615 = "tf.Where"(%614) {device = ""} : (tensor) -> tensor + %616 = "tf.Squeeze"(%615) {device = "", squeeze_dims = [1]} : (tensor) -> tensor + %617 = "tf.GatherV2"(%611, %616, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %618:2 = "tf.RaggedRange"(%594, %617, %13) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %619 = "tf.If"(%496, %496, %492, %13) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_1_AssertGuard_false_22980, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_1_AssertGuard_true_22970} : (tensor, tensor, tensor, tensor) -> tensor + %620 = "tf.Identity"(%619) {device = ""} : (tensor) -> tensor + %621 = "tf.Select"(%2, %492, %13) {device = ""} : (tensor, tensor, tensor) -> tensor + %622 = "tf.Pack"(%621) {axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi64> + %623 = "tf.ConcatV2"(%1, %622, %12, %14) {device = ""} : (tensor<0xi64>, tensor<1xi64>, tensor<1xi64>, tensor) -> tensor<2xi64> + %624 = "tf.StridedSlice"(%623, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %625 = "tf.Equal"(%624, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %626 = "tf.StridedSlice"(%623, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %627 = "tf.StridedSlice"(%623, %16, %6, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %628 = "tf.Equal"(%627, %13) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %629 = "tf.If"(%628, %628, %627, %571) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_2_AssertGuard_false_23470, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_2_AssertGuard_true_23460} : (tensor, tensor, tensor, tensor) -> tensor + %630 = "tf.Identity"(%629) {device = ""} : (tensor) -> tensor + %631 = "tf.If"(%625, %625, %571, %626) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = "", else_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_3_AssertGuard_false_23830, is_stateless = false, output_shapes = [#tf.shape<>], then_branch = @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_3_AssertGuard_true_23820} : (tensor, tensor, tensor, tensor) -> tensor + %632 = "tf.Identity"(%631) {device = ""} : (tensor) -> tensor + %633 = "tf.Identity"(%308) {device = ""} : (tensor) -> tensor + %634 = "tf.Shape"(%37#2) {device = ""} : (tensor) -> 
tensor<1xi32> + %635 = "tf.StridedSlice"(%634, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %636 = "tf.Cast"(%635) {Truncate = false, device = ""} : (tensor<0xi32>) -> tensor<0xi64> + %637 = "tf.Identity"(%636) {device = ""} : (tensor<0xi64>) -> tensor<0xi64> + %638 = "tf.Shape"(%37#3) {device = ""} : (tensor) -> tensor<1xi32> + %639 = "tf.StridedSlice"(%638, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %640 = "tf.Cast"(%639) {Truncate = false, device = ""} : (tensor<0xi32>) -> tensor<0xi64> + %641 = "tf.Identity"(%640) {device = ""} : (tensor<0xi64>) -> tensor<0xi64> + %642 = "tf.GatherV2"(%37#3, %336, %14) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %643 = "tf.Tile"(%642, %340) {device = ""} : (tensor, tensor<1xi64>) -> tensor + %644 = "tf.Sub"(%643, %474) {device = ""} : (tensor, tensor) -> tensor + %645 = "tf.Shape"(%644) {device = ""} : (tensor) -> tensor<1xi32> + %646 = "tf.StridedSlice"(%645, %16, %15, %16) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> + %647 = "tf.Cast"(%646) {Truncate = false, device = ""} : (tensor<0xi32>) -> tensor<0xi64> + %648 = "tf.Identity"(%647) {device = ""} : (tensor<0xi64>) -> tensor<0xi64> + %649 = "tf.UnicodeEncode"(%37#0, %58) {Tsplits = i64, device = "", errors = "replace", output_encoding = "UTF-8", replacement_char = 65533 : i64} : (tensor, tensor) -> tensor + %650 = "tf.Identity"(%649) {device = ""} : (tensor) -> tensor + return %650 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedConcat_assert_equal_1_Assert_AssertGuard_false_3220(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Input tensors have incompatible shapes."> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/RaggedConcat/RaggedFromTensor/Const:0) = "> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/WhitespaceTokenize/RaggedConcat/RaggedNRows/Const:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedConcat_assert_equal_1_Assert_AssertGuard_true_3210(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func 
@WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_3980(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:zero"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits/RowPartitionFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits/RowPartitionFromRowSplits/Const:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_3970(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_4340(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:monotonic"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x >= 0 did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits/RowPartitionFromRowSplits/sub:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor) -> () + %3 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %4 = "tf.Identity"(%3) {device = ""} : (tensor) -> tensor + return %4 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_4330(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_4680(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to _from_row_partition do not form a valid RaggedTensor"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = 
dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits/strided_slice_1:0) = "> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_4670(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_5050(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:zero"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits_1/RowPartitionFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits_1/RowPartitionFromRowSplits/Const:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_5040(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_5410(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:monotonic"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x >= 0 did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits_1/RowPartitionFromRowSplits/sub:0) = "> : tensor} : () -> tensor + 
"tf.Assert"(%arg0, %0, %1, %2, %arg1) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor) -> () + %3 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %4 = "tf.Identity"(%3) {device = ""} : (tensor) -> tensor + return %4 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_5400(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_assert_equal_1_Assert_AssertGuard_false_5770(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to _from_row_partition do not form a valid RaggedTensor"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits_1/strided_slice:0) = "> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits/RaggedFromRowSplits_1/RaggedNRows/sub:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_RaggedFromRowSplits_1_assert_equal_1_Assert_AssertGuard_true_5760(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_6120(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:zero"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_1/RaggedFromRowSplits/RowPartitionFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_1/RaggedFromRowSplits/RowPartitionFromRowSplits/Const:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func 
@WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_6110(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_6480(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:monotonic"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x >= 0 did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_1/RaggedFromRowSplits/RowPartitionFromRowSplits/sub:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor) -> () + %3 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %4 = "tf.Identity"(%3) {device = ""} : (tensor) -> tensor + return %4 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_6470(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_6820(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to _from_row_partition do not form a valid RaggedTensor"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_1/RaggedFromRowSplits/strided_slice_1:0) = "> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_1/RaggedFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_1_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_6810(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func 
@WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_7190(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:zero"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_2/RaggedFromRowSplits/RowPartitionFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_2/RaggedFromRowSplits/RowPartitionFromRowSplits/Const:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_7180(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_7550(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:monotonic"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x >= 0 did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_2/RaggedFromRowSplits/RowPartitionFromRowSplits/sub:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor) -> () + %3 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %4 = "tf.Identity"(%3) {device = ""} : (tensor) -> tensor + return %4 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_7540(%arg0: tensor, %arg1: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_7890(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to _from_row_partition do not form a valid RaggedTensor"> : tensor} : () -> tensor + %1 = 
"tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_2/RaggedFromRowSplits/strided_slice_1:0) = "> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"y (WhitespaceTokenize/WhitespaceTokenize/RaggedFromNestedRowSplits_2/RaggedFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedFromNestedRowSplits_2_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_7880(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_AssertGuard_false_8690(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_AssertGuard_true_8680(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_1_AssertGuard_false_9760(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_1_AssertGuard_true_9750(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = 
"tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_2_AssertGuard_false_10250(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_2_AssertGuard_true_10240(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_3_AssertGuard_false_10610(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_Assert_3_AssertGuard_true_10600(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_Assert_AssertGuard_false_15310(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_Assert_AssertGuard_true_15300(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes 
{tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_Assert_1_AssertGuard_false_16380(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_Assert_1_AssertGuard_true_16370(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_Assert_2_AssertGuard_false_16870(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_Assert_2_AssertGuard_true_16860(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_Assert_3_AssertGuard_false_17230(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func 
@WhitespaceTokenize_WhitespaceTokenize_Assert_3_AssertGuard_true_17220(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_AssertGuard_false_21910(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_AssertGuard_true_21900(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_1_AssertGuard_false_22980(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_1_AssertGuard_true_22970(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_2_AssertGuard_false_23470(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> 
() + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_2_AssertGuard_true_23460(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_3_AssertGuard_false_23830(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Unable to broadcast: dimension size mismatch in dimension"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"lengths="> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"dim_size="> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_3_AssertGuard_true_23820(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} + +// CHECK: func @whitespace_tokenizer_rank0(%arg0: tensor {tf._user_specified_name = "input"}) -> tensor attributes {sym_visibility = "private", tf._implements = #tf.func<@"tftext:WhitespaceTokenizer", {}>, tf._input_shapes = [#tf.shape<>], tf.signature.is_stateful} { +// CHECK: %0 = "tfl.custom"(%arg0) {custom_code = "tftext:WhitespaceTokenizer", custom_option = opaque<"tfl", "0x"> : tensor<0xi8>} : (tensor) -> tensor +// CHECK: return %0 : tensor + +func @ngrams(%arg0: tensor {tf._user_specified_name = "input"}) -> tensor attributes {tf._input_shapes = [#tf.shape], tf._implements = #tf.func<@"tftext:Ngrams", {axis = -1 : i64, reduction_type = "STRING_JOIN", string_separator = " ", width = 2 : i64}>} { + %0 = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<[0, -1]> : tensor<2xi32>} : () -> tensor<2xi32> + %2 = "tf.Const"() {value = dense<[0, 1]> : tensor<2xi32>} : () -> tensor<2xi32> + %3 = "tf.Const"() {value = dense<0> : tensor<2xi32>} : () -> tensor<2xi32> + %4 = "tf.Const"() {value = dense<1> : tensor<2xi32>} : () -> tensor<2xi32> + %5 = "tf.StridedSlice"(%arg0, %3, %1, %4) {begin_mask = 0 : i64, device = "", ellipsis_mask = 1 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor + %6 = "tf.StridedSlice"(%arg0, %2, %3, %4) {begin_mask = 0 : i64, device = "", ellipsis_mask = 1 : i64, end_mask = 2 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor + %7 = "tf.Pack"(%5, %6) {axis = -1 : i64, device = ""} : (tensor, tensor) -> tensor + %8 = "tf.ReduceJoin"(%7, %0) {device = "", keep_dims = false, separator = " "} : (tensor, tensor) -> tensor + %9 = 
"tf.Identity"(%8) {device = ""} : (tensor) -> tensor + return %9 : tensor +} + +// CHECK: func @ngrams(%arg0: tensor {tf._user_specified_name = "input"}) -> tensor attributes {tf._implements = #tf.func<@"tftext:Ngrams", {axis = -1 : i64, reduction_type = "STRING_JOIN", string_separator = " ", width = 2 : i64}>, tf._input_shapes = [#tf.shape]} { +// CHECK: %0 = "tfl.custom"(%arg0) {custom_code = "tftext:Ngrams", custom_option = opaque<"tfl", "0x776964746800737472696E675F736570617261746F72000120006178697300726564756374696F6E5F74797065000B535452494E475F4A4F494E0004221E383F040104FF152D0204141404082401"> : tensor<78xi8>} : (tensor) -> tensor +// CHECK: return %0 : tensor +// CHECK: } + +func @ngrams_ragged_rank_2(%arg0: tensor {tf._user_specified_name = "values"}, %arg1: tensor<3xi64> {tf._user_specified_name = "args_0"}, %arg2: tensor {tf._user_specified_name = "args_1"}) -> (tensor, tensor<3xi64>, tensor) attributes {sym_visibility = "private", tf._implements = #tf.func<@"tftext:Ngrams", {axis = -1 : i64, reduction_type = "STRING_JOIN", string_separator = "", width = 2 : i64}>, tf._input_shapes = [#tf.shape, #tf.shape<3>, #tf.shape], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %4 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %5 = "tf.Const"() {value = dense<> : tensor<0xi32>} : () -> tensor<0xi32> + %6 = "tf.Const"() {value = dense<-1> : tensor<1xi32>} : () -> tensor<1xi32> + %7 = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> + %8 = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + %9 = "tf.StridedSlice"(%arg1, %7, %8, %8) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<3xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %10 = "tf.Equal"(%9, %4) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %11 = "tf.All"(%10, %5) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor + %12 = "tf.StridedSlice"(%arg1, %8, %7, %8) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<3xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi64> + %13 = "tf.StridedSlice"(%arg1, %7, %6, %8) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<3xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi64> + %14 = "tf.Sub"(%12, %13) {device = ""} : (tensor<2xi64>, tensor<2xi64>) -> tensor<2xi64> + %15 = "tf.LessEqual"(%4, %14) {device = ""} : (tensor, tensor<2xi64>) -> tensor<2xi1> + %16 = "tf.All"(%15, %7) {device = "", keep_dims = false} : (tensor<2xi1>, tensor<1xi32>) -> tensor + %17 = "tf.StridedSlice"(%arg2, %7, %8, %8) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %18 = "tf.Equal"(%17, %4) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %19 = "tf.All"(%18, %5) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor + %20 = "tf.IfRegion"(%19) ( { + %72 = "std.call"(%19, %17, 
%4) {callee = @RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_27770} : (tensor, tensor, tensor) -> tensor + "tf.Yield"(%72) : (tensor) -> () + }, { + %72 = "std.call"(%19, %17, %4) {callee = @RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_27780} : (tensor, tensor, tensor) -> tensor + "tf.Yield"(%72) : (tensor) -> () + }) {is_stateless = false} : (tensor) -> tensor + %21 = "tf.Identity"(%20) {device = ""} : (tensor) -> tensor + %22 = "tf.StridedSlice"(%arg2, %8, %7, %8) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %23 = "tf.StridedSlice"(%arg2, %7, %6, %8) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %24 = "tf.Sub"(%22, %23) {device = ""} : (tensor, tensor) -> tensor + %25 = "tf.LessEqual"(%4, %24) {device = ""} : (tensor, tensor) -> tensor + %26 = "tf.All"(%25, %7) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor + %27 = "tf.IfRegion"(%26) ( { + %72 = "std.call"(%26, %24) {callee = @RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_28130} : (tensor, tensor) -> tensor + "tf.Yield"(%72) : (tensor) -> () + }, { + %72 = "std.call"(%26, %24) {callee = @RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_28140} : (tensor, tensor) -> tensor + "tf.Yield"(%72) : (tensor) -> () + }) {is_stateless = false} : (tensor) -> tensor + %28 = "tf.Identity"(%27) {device = ""} : (tensor) -> tensor + %29 = "tf.Identity"(%arg2) {_class = ["loc:@args_1"], device = ""} : (tensor) -> tensor + %30 = "tf.StridedSlice"(%29, %6, %7, %8) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %31 = "tf.Shape"(%arg0) {device = ""} : (tensor) -> tensor<1xi64> + %32 = "tf.StridedSlice"(%31, %7, %8, %8) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %33 = "tf.Equal"(%30, %32) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %34 = "tf.All"(%33, %5) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor + %35 = "tf.IfRegion"(%34) ( { + %72 = "std.call"(%34, %30, %32) {callee = @RaggedFromNestedRowSplits_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_28500} : (tensor, tensor, tensor) -> tensor + "tf.Yield"(%72) : (tensor) -> () + }, { + %72 = "std.call"(%34, %30, %32) {callee = @RaggedFromNestedRowSplits_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_28510} : (tensor, tensor, tensor) -> tensor + "tf.Yield"(%72) : (tensor) -> () + }) {is_stateless = false} : (tensor) -> tensor + %36 = "tf.Identity"(%35) {device = ""} : (tensor) -> tensor + %37 = "tf.Identity"(%29) {_class = ["loc:@args_1"], device = ""} : (tensor) -> tensor + %38 = "tf.StridedSlice"(%37, %7, %6, %8) {begin_mask = 1 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : 
i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %39 = "tf.StridedSlice"(%37, %8, %7, %8) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %40 = "tf.Minimum"(%38, %39) {device = ""} : (tensor, tensor) -> tensor + %41 = "tf.AddV2"(%39, %1) {device = ""} : (tensor, tensor) -> tensor + %42 = "tf.Maximum"(%41, %38) {device = ""} : (tensor, tensor) -> tensor + %43:2 = "tf.RaggedRange"(%40, %42, %3) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %44 = "tf.GatherV2"(%arg0, %43#1, %2) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %45 = "tf.AddV2"(%38, %3) {device = ""} : (tensor, tensor) -> tensor + %46 = "tf.Minimum"(%45, %39) {device = ""} : (tensor, tensor) -> tensor + %47:2 = "tf.RaggedRange"(%46, %39, %3) {T = i64, Tsplits = i64, device = ""} : (tensor, tensor, tensor) -> (tensor, tensor) + %48 = "tf.Equal"(%43#0, %47#0) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %49 = "tf.All"(%48, %7) {device = "", keep_dims = false} : (tensor, tensor<1xi32>) -> tensor + %50 = "tf.GatherV2"(%arg0, %47#1, %2) {batch_dims = 0 : i64, device = ""} : (tensor, tensor, tensor) -> tensor + %51 = "tf.Shape"(%37) {device = ""} : (tensor) -> tensor<1xi64> + %52 = "tf.StridedSlice"(%51, %7, %8, %8) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %53 = "tf.Sub"(%52, %3) {device = ""} : (tensor, tensor) -> tensor + %54 = "tf.IfRegion"(%11) ( { + %72 = "std.call"(%11, %9, %4) {callee = @RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_28900} : (tensor, tensor, tensor) -> tensor + "tf.Yield"(%72) : (tensor) -> () + }, { + %72 = "std.call"(%11, %9, %4) {callee = @RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_28910} : (tensor, tensor, tensor) -> tensor + "tf.Yield"(%72) : (tensor) -> () + }) {is_stateless = false} : (tensor) -> tensor + %55 = "tf.Identity"(%54) {device = ""} : (tensor) -> tensor + %56 = "tf.IfRegion"(%16) ( { + %72 = "std.call"(%16, %14) {callee = @RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_29260} : (tensor, tensor<2xi64>) -> tensor + "tf.Yield"(%72) : (tensor) -> () + }, { + %72 = "std.call"(%16, %14) {callee = @RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_29270} : (tensor, tensor<2xi64>) -> tensor + "tf.Yield"(%72) : (tensor) -> () + }) {is_stateless = false} : (tensor) -> tensor + %57 = "tf.Identity"(%56) {device = ""} : (tensor) -> tensor + %58 = "tf.Identity"(%arg1) {_class = ["loc:@args_0"], device = ""} : (tensor<3xi64>) -> tensor<3xi64> + %59 = "tf.StridedSlice"(%58, %6, %7, %8) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<3xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %60 = "tf.Equal"(%59, %53) {device = "", incompatible_shape_error = true} : (tensor, tensor) -> tensor + %61 = 
"tf.All"(%60, %5) {device = "", keep_dims = false} : (tensor, tensor<0xi32>) -> tensor + %62 = "tf.IfRegion"(%61) ( { + %72 = "std.call"(%61, %59, %53) {callee = @RaggedFromNestedRowSplits_RaggedFromRowSplits_1_assert_equal_1_Assert_AssertGuard_true_29650} : (tensor, tensor, tensor) -> tensor + "tf.Yield"(%72) : (tensor) -> () + }, { + %72 = "std.call"(%61, %59, %53) {callee = @RaggedFromNestedRowSplits_RaggedFromRowSplits_1_assert_equal_1_Assert_AssertGuard_false_29660} : (tensor, tensor, tensor) -> tensor + "tf.Yield"(%72) : (tensor) -> () + }) {is_stateless = false} : (tensor) -> tensor + %63 = "tf.IfRegion"(%49) ( { + %72 = "std.call"(%49, %43#0, %47#0) {callee = @NGrams_SlidingWindow_RaggedConcat_assert_equal_2_Assert_AssertGuard_true_30330} : (tensor, tensor, tensor) -> tensor + "tf.Yield"(%72) : (tensor) -> () + }, { + %72 = "std.call"(%49, %43#0, %47#0) {callee = @NGrams_SlidingWindow_RaggedConcat_assert_equal_2_Assert_AssertGuard_false_30340} : (tensor, tensor, tensor) -> tensor + "tf.Yield"(%72) : (tensor) -> () + }) {is_stateless = false} : (tensor) -> tensor + %64 = "tf.Identity"(%43#0) {device = ""} : (tensor) -> tensor + %65 = "tf.Identity"(%63) {device = ""} : (tensor) -> tensor + %66 = "tf.Pack"(%44, %50) {axis = 1 : i64, device = ""} : (tensor, tensor) -> tensor + %67 = "tf.ReduceJoin"(%66, %0) {device = "", keep_dims = false, separator = ""} : (tensor, tensor) -> tensor + %68 = "tf.Identity"(%67) {device = ""} : (tensor) -> tensor + %69 = "tf.Identity"(%62) {device = ""} : (tensor) -> tensor + %70 = "tf.Identity"(%58) {_class = ["loc:@args_0"], device = ""} : (tensor<3xi64>) -> tensor<3xi64> + %71 = "tf.Identity"(%70) {device = ""} : (tensor<3xi64>) -> tensor<3xi64> + return %68, %71, %64 : tensor, tensor<3xi64>, tensor +} +func @RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_27770(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {sym_visibility = "private", tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_27780(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {sym_visibility = "private", tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:zero"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (RaggedFromNestedRowSplits/RaggedFromRowSplits/RowPartitionFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"y (RaggedFromNestedRowSplits/RaggedFromRowSplits/RowPartitionFromRowSplits/Const:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_28130(%arg0: tensor, %arg1: tensor) -> tensor attributes 
{sym_visibility = "private", tf._input_shapes = [#tf.shape<>, #tf.shape]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @RaggedFromNestedRowSplits_RaggedFromRowSplits_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_28140(%arg0: tensor, %arg1: tensor) -> tensor attributes {sym_visibility = "private", tf._input_shapes = [#tf.shape<>, #tf.shape], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:monotonic"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x >= 0 did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (RaggedFromNestedRowSplits/RaggedFromRowSplits/RowPartitionFromRowSplits/sub:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor) -> () + %3 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %4 = "tf.Identity"(%3) {device = ""} : (tensor) -> tensor + return %4 : tensor +} +func @RaggedFromNestedRowSplits_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_true_28500(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {sym_visibility = "private", tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @RaggedFromNestedRowSplits_RaggedFromRowSplits_assert_equal_1_Assert_AssertGuard_false_28510(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {sym_visibility = "private", tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to _from_row_partition do not form a valid RaggedTensor"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (RaggedFromNestedRowSplits/RaggedFromRowSplits/strided_slice_1:0) = "> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"y (RaggedFromNestedRowSplits/RaggedFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor +"tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_true_28900(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {sym_visibility = "private", tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_equal_1_Assert_AssertGuard_false_28910(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {sym_visibility = "private", tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:zero"> : tensor} : () -> tensor + %1 = 
"tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (RaggedFromNestedRowSplits/RaggedFromRowSplits_1/RowPartitionFromRowSplits/strided_slice:0) = "> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"y (RaggedFromNestedRowSplits/RaggedFromRowSplits_1/RowPartitionFromRowSplits/Const:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_true_29260(%arg0: tensor, %arg1: tensor<2xi64>) -> tensor attributes {sym_visibility = "private", tf._input_shapes = [#tf.shape<>, #tf.shape<2>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @RaggedFromNestedRowSplits_RaggedFromRowSplits_1_RowPartitionFromRowSplits_assert_non_negative_assert_less_equal_Assert_AssertGuard_false_29270(%arg0: tensor, %arg1: tensor<2xi64>) -> tensor attributes {sym_visibility = "private", tf._input_shapes = [#tf.shape<>, #tf.shape<2>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to from_row_splits do not form a valid RaggedTensor:monotonic"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x >= 0 did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (RaggedFromNestedRowSplits/RaggedFromRowSplits_1/RowPartitionFromRowSplits/sub:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor<2xi64>) -> () + %3 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %4 = "tf.Identity"(%3) {device = ""} : (tensor) -> tensor + return %4 : tensor +} +func @RaggedFromNestedRowSplits_RaggedFromRowSplits_1_assert_equal_1_Assert_AssertGuard_true_29650(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {sym_visibility = "private", tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @RaggedFromNestedRowSplits_RaggedFromRowSplits_1_assert_equal_1_Assert_AssertGuard_false_29660(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {sym_visibility = "private", tf._input_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Arguments to _from_row_partition do not form a valid RaggedTensor"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (RaggedFromNestedRowSplits/RaggedFromRowSplits_1/strided_slice:0) = "> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"y (RaggedFromNestedRowSplits/RaggedFromRowSplits_1/RaggedNRows/sub:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = 
"tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +func @NGrams_SlidingWindow_RaggedConcat_assert_equal_2_Assert_AssertGuard_true_30330(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {sym_visibility = "private", tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape]} { + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + return %1 : tensor +} +func @NGrams_SlidingWindow_RaggedConcat_assert_equal_2_Assert_AssertGuard_false_30340(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {sym_visibility = "private", tf._input_shapes = [#tf.shape<>, #tf.shape, #tf.shape], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<"Inputs must have identical ragged splits"> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<"Condition x == y did not hold element-wise:"> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x (NGrams/SlidingWindow/RaggedGetItem/RaggedRange:0) = "> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<"y (NGrams/SlidingWindow/RaggedGetItem_1/RaggedRange:0) = "> : tensor} : () -> tensor + "tf.Assert"(%arg0, %0, %1, %2, %arg1, %3, %arg2) {device = "", summarize = 3 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> () + %4 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + return %5 : tensor +} +// CHECK: func @ngrams_ragged_rank_2(%arg0: tensor {tf._user_specified_name = "values"}, %arg1: tensor<3xi64> {tf._user_specified_name = "args_0"}, %arg2: tensor {tf._user_specified_name = "args_1"}) -> (tensor, tensor<3xi64>, tensor) attributes {sym_visibility = "private", tf._implements = #tf.func<@"tftext:Ngrams", {axis = -1 : i64, reduction_type = "STRING_JOIN", string_separator = "", width = 2 : i64}>, tf._input_shapes = [#tf.shape, #tf.shape<3>, #tf.shape], tf.signature.is_stateful} { +// CHECK: %0:3 = "tfl.custom"(%arg0, %arg1, %arg2) {custom_code = "tftext:Ngrams", custom_option = opaque<"tfl", "0x776964746800737472696E675F736570617261746F720000006178697300726564756374696F6E5F74797065000B535452494E475F4A4F494E0004221E373E040104FF152C0204141404082401"> : tensor<77xi8>} : (tensor, tensor<3xi64>, tensor) -> (tensor, tensor<3xi64>, tensor) +// CHECK: return %0#0, %0#1, %0#2 : tensor, tensor<3xi64>, tensor \ No newline at end of file diff --git a/tensorflow/compiler/mlir/lite/tests/lower-static-tensor-list.mlir b/tensorflow/compiler/mlir/lite/tests/lower-static-tensor-list.mlir index 1a61bc3f517..1ebe912284b 100644 --- a/tensorflow/compiler/mlir/lite/tests/lower-static-tensor-list.mlir +++ b/tensorflow/compiler/mlir/lite/tests/lower-static-tensor-list.mlir @@ -277,6 +277,45 @@ func @tensorlistWhileCond(%arg0: tensor, %arg1: tensor) -> ten // CHECK: return %[[RESULT]] : tensor } +// CHECK-LABEL: func @tensorlistWhileRegion +func @tensorlistWhileRegion(%arg0: tensor<2x3xf32>) -> tensor<*xf32> { + %cst = constant dense<3> : tensor<1xi32> + %cst_0 = constant dense<0> : tensor + %cst_1 = constant dense<-1> : tensor + %0 = "tf.TensorListFromTensor"(%arg0, %cst) : (tensor<2x3xf32>, tensor<1xi32>) -> tensor>> + // CHECK: "tf.WhileRegion" + %1:2 = "tf.WhileRegion"(%cst_0, %0) ({ + ^bb0(%carg0: tensor, %carg1: tensor): + %cst_2 = constant dense<2> : tensor + %1 = "tf.Less"(%carg0, %cst_2) : (tensor, tensor) -> tensor + "tf.Yield"(%1) : (tensor) -> () + + // verify condition types + // CHECK: ^bb0(%[[CARG0:.*]]: tensor, %[[CARG1:.*]]: 
tensor<*xf32>): + // CHECK: %[[COND:.*]] = "tf.Less"(%[[CARG0]], {{.*}}) : (tensor, tensor) -> tensor + // CHECK: "tf.Yield"(%[[COND]]) : (tensor) -> () + + }, + { + ^bb0(%barg0: tensor, %barg1: tensor): + %1 = "tf.TensorListLength"(%barg1) : (tensor) -> tensor + "tf.Yield"(%1, %barg1) : (tensor, tensor) -> () + + // verify body types + // CHECK: ^bb0(%[[BARG0:.*]]: tensor, %[[BARG1:.*]]: tensor<*xf32>): + // CHECK-NOT: tensor + // CHECK: %[[LEN:.*]] = "tf.Gather" + // CHECK-NOT: tensor + // CHECK: "tf.Yield"(%[[LEN]], %[[BARG1]]) : (tensor, tensor<*xf32>) -> () + + }) {is_stateless = false} : (tensor, tensor>>) -> (tensor, tensor>>) + // make sure the variant types in input/output have been updated + // CHECK: {is_stateless = false} : (tensor, tensor<2x3xf32>) -> (tensor, tensor<*xf32>) + %2 = "tf.TensorListStack"(%1#1, %cst_1) : (tensor>>, tensor) -> tensor<*xf32> + // CHECK: return %0#1 : tensor<*xf32> + return %2 : tensor<*xf32> +} + func @tensorlistResize(%arg0: tensor<3x10xf32>, %arg1: tensor<1xi32>, %arg2: tensor) -> tensor { %0 = "tf.TensorListFromTensor"(%arg0, %arg1) : (tensor<3x10xf32>, tensor<1xi32>) -> tensor>> %1 = "tf.TensorListResize"(%0, %arg2) : (tensor>>, tensor) -> tensor>> diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/flex_op_with_complex128.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/flex_op_with_complex128.mlir new file mode 100644 index 00000000000..a5e6d4aabb5 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/flex_op_with_complex128.mlir @@ -0,0 +1,66 @@ +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -emit-select-tf-ops -o - | flatbuffer_to_string - | FileCheck %s + +func @main(tensor<4xcomplex>, tensor<4xcomplex>) -> tensor<4xcomplex> { +^bb0(%arg0: tensor<4xcomplex>, %arg1: tensor<4xcomplex>): +// CHECK: { +// CHECK-NEXT: version: 3, +// CHECK-NEXT: operator_codes: [ { +// CHECK-NEXT: builtin_code: CUSTOM, +// CHECK-NEXT: custom_code: "FlexAdd" +// CHECK-NEXT: } ], +// CHECK-NEXT: subgraphs: [ { +// CHECK-NEXT: tensors: [ { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: type: COMPLEX128, +// CHECK-NEXT: buffer: 1, +// CHECK-NEXT: name: "arg0", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: type: COMPLEX128, +// CHECK-NEXT: buffer: 2, +// CHECK-NEXT: name: "arg1", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: type: COMPLEX128, +// CHECK-NEXT: buffer: 3, +// CHECK-NEXT: name: "add", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: } ], +// CHECK-NEXT: inputs: [ 0, 1 ], +// CHECK-NEXT: outputs: [ 2 ], +// CHECK-NEXT: operators: [ { +// CHECK-NEXT: inputs: [ 0, 1 ], +// CHECK-NEXT: outputs: [ 2 ], +// CHECK-NEXT: custom_options: [ 3, 65, 100, 100, 0, 20, 18, 3, 65, 100, 100, 26, 0, 26, 0, 42, 7, 10, 1, 84, 18, 2, 48, 18, 50, 0, 0, 2, 27, 23, 20, 20, 4, 40, 1 ] +// CHECK-NEXT: } ], +// CHECK-NEXT: name: "main" +// CHECK-NEXT: } ], +// CHECK-NEXT: description: "MLIR Converted.", +// CHECK-NEXT: buffers: [ { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: } ], +// CHECK-NEXT: metadata: [ { +// CHECK-NEXT: name: "min_runtime_version", +// CHECK-NEXT: buffer: 4 +// CHECK-NEXT: } ] +// CHECK-NEXT:} + + %0 = 
"tf.Add"(%arg0, %arg1) : (tensor<4xcomplex>, tensor<4xcomplex>) -> tensor<4xcomplex> loc("add") + return %0 : tensor<4xcomplex> +} diff --git a/tensorflow/compiler/mlir/lite/tests/ops.mlir b/tensorflow/compiler/mlir/lite/tests/ops.mlir index 5f434e954c8..7ef6997f938 100644 --- a/tensorflow/compiler/mlir/lite/tests/ops.mlir +++ b/tensorflow/compiler/mlir/lite/tests/ops.mlir @@ -598,6 +598,16 @@ func @testMaxPool2DWrongOperandStorageType(tensor<1x7x7x16x!quant.uniform, %arg1: tensor<1x64x64x32xf32>, %arg2: tensor<1x64x64x32xf32>) -> (tensor<1x32x32x32xf32>, tensor<1x32x32x32xf32>) { + %0, %1, %2, %3 = "tfl.custom_tf"(%arg0, %arg1, %arg2) ({ + %4, %5, %6, %7 = "tf.TFLite_Detection_PostProcess"(%arg0, %arg1, %arg2) {_output_quantized = true, _output_types = [f32, f32, f32, f32], _support_output_type_float_in_quantized_op = true, detections_per_class = 100 : i64, device = "", h_scale = 5.000000e+00 : f32, max_classes_per_detection = 1 : i64, max_detections = 20 : i64, nms_iou_threshold = 6.000000e-01 : f32, nms_score_threshold = 3.000000e-01 : f32, num_classes = 90 : i64, use_regular_nms = false, w_scale = 5.000000e+00 : f32, x_scale = 1.000000e+01 : f32, y_scale = 1.000000e+01 : f32} : (tensor<1x64x64x32xf32>, tensor<1x64x64x32xf32>, tensor<1x64x64x32xf32>) -> (tensor<1x32x32x32xf32>, tensor<1x32x32x32xf32>, tensor<1x32x32x32xf32>, tensor<1x32x32x32xf32>) + "tfl.yield"(%4, %5, %6, %7) : (tensor<1x32x32x32xf32>, tensor<1x32x32x32xf32>, tensor<1x32x32x32xf32>, tensor<1x32x32x32xf32>) -> () + }) : (tensor<1x64x64x32xf32>, tensor<1x64x64x32xf32>, tensor<1x64x64x32xf32>) -> (tensor<1x32x32x32xf32>, tensor<1x32x32x32xf32>, tensor<1x32x32x32xf32>, tensor<1x32x32x32xf32>) + return %0, %1 : tensor<1x32x32x32xf32>, tensor<1x32x32x32xf32> +} + +// ----- + func @testMaxPoolingWithArgMax2D(%arg0: tensor<1x64x64x32xf32>) -> (tensor<1x32x32x32xf32>, tensor<1x32x32x32xf32>) { // custom op for "tfl.max_pooling_with_argmax_2d"(%arg0) {filter_h = 2 : i32, filter_w = 2 : i32, padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x64x64x32xf32>) -> (tensor<1x32x32x32xf32>, tensor<1x32x32x32xf32>) %0, %1 = "tfl.custom"(%arg0) {custom_option = opaque<"tfl", "0x01000000020000000200000002000000020000000000000000000000000000000000000000000000"> : tensor<40xi8>, custom_code = "MaxPoolingWithArgmax2D"} : (tensor<1x64x64x32xf32>) -> (tensor<1x32x32x32xf32>, tensor<1x32x32x32xf32>) @@ -1238,6 +1248,13 @@ func @testSpaceToBatchND(%arg0 : tensor<1x4x4x3xf32>, %arg1 : tensor<2xi32>, %ar // ----- +func @testBatchMatmulQuant(%arg0 : tensor<1x4x384x32x!quant.uniform>, %arg1 : tensor<1x4x384x32x!quant.uniform>) -> tensor<1x4x384x384x!quant.uniform> { + // CHECK: "tfl.batch_matmul"(%arg0, %arg1) + %0 = "tfl.batch_matmul"(%arg0, %arg1) {adj_x = false, adj_y = true} : (tensor<1x4x384x32x!quant.uniform>, tensor<1x4x384x32x!quant.uniform>) -> tensor<1x4x384x384x!quant.uniform> + return %0 : tensor<1x4x384x384x!quant.uniform> +} +// ----- + func @testConcat(%arg0: tensor<1x2xi32>, %arg1: tensor<1x2xi32>) -> tensor<2x2xi32> { // CHECK: "tfl.concatenation"(%arg0, %arg1) {axis = 0 : i32, fused_activation_function = "NONE"} %0 = "tfl.concatenation"(%arg0, %arg1) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<1x2xi32>, tensor<1x2xi32>) -> tensor<2x2xi32> diff --git a/tensorflow/compiler/mlir/lite/tests/optimize.mlir b/tensorflow/compiler/mlir/lite/tests/optimize.mlir index 7861eb1ec6b..7923c82ba92 100644 --- a/tensorflow/compiler/mlir/lite/tests/optimize.mlir +++ 
b/tensorflow/compiler/mlir/lite/tests/optimize.mlir @@ -400,6 +400,32 @@ func @FuseFullyConnectedReshapeAddConst(%arg0: tensor<40x37xf32>, %arg1: tensor< // FOLD: return %[[fc]] } +// CHECK-LABEL: @FuseFullyConnectedReshapeAddConstWithActivation +// FOLD-LABEL: @FuseFullyConnectedReshapeAddConstWithActivation +func @FuseFullyConnectedReshapeAddConstWithActivation(%arg0: tensor<40x37xf32>, %arg1: tensor<40x37xf32>) -> tensor<40x40xf32> { + %cst = constant dense<3.0> : tensor<40x40xf32> + %cst2 = constant dense<2.0> : tensor<40xf32> + %shape1 = constant dense<[1, 40, 40]> : tensor<3xi32> + %shape2 = constant dense<[40, 40]> : tensor<2xi32> + + %0 = "tfl.fully_connected"(%arg0, %arg1, %cst) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<40x37xf32>, tensor<40x37xf32>, tensor<40x40xf32>) -> (tensor<40x40xf32>) + %1 = "tfl.reshape"(%0, %shape1) : (tensor<40x40xf32>, tensor<3xi32>) -> tensor<1x40x40xf32> + %2 = "tfl.add"(%1, %cst2) {fused_activation_function = "RELU6"} : (tensor<1x40x40xf32>, tensor<40xf32>) -> tensor<1x40x40xf32> + %3 = "tfl.reshape"(%2, %shape2) : (tensor<1x40x40xf32>, tensor<2xi32>) -> tensor<40x40xf32> + + return %3 : tensor<40x40xf32> + + // CHECK: %[[cst:.*]] = constant dense<5.000000e+00> : tensor<40x40xf32> + // CHECK: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %arg1, %[[cst]]) {fused_activation_function = "RELU6", keep_num_dims = false, weights_format = "DEFAULT"} + // CHECK: %[[rs1:.*]] = "tfl.reshape"(%[[fc]] + // CHECK: %[[rs2:.*]] = "tfl.reshape"(%[[rs1]] + // CHECK: return %[[rs2]] + + // FOLD: %[[cst:.*]] = constant dense<5.000000e+00> : tensor<40x40xf32> + // FOLD: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %arg1, %[[cst]]) {fused_activation_function = "RELU6", keep_num_dims = false, weights_format = "DEFAULT"} + // FOLD: return %[[fc]] +} + // CHECK-LABEL: @NotReorderReshapeAddIfNotBroadcastableAfter func @NotReorderReshapeAddIfNotBroadcastableAfter(%arg0: tensor<40x10x4xf32>) -> tensor<40x40xf32> { %cst = constant dense<2.0> : tensor<40xf32> @@ -829,6 +855,15 @@ func @doNotConvertNonTrivialTransposeToReshape(%arg0: tensor<6x6x256x1xf32>) -> // CHECK: return %[[RESULT]] } +// CHECK-LABEL: Relu +func @Relu(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { + %cst = constant dense<0.0> : tensor + %0 = "tfl.maximum"(%arg0, %cst) : (tensor<2x3xf32>, tensor) -> tensor<2x3xf32> + return %0 : tensor<2x3xf32> + + // CHECK: %[[RESULT:.*]] = "tfl.relu"(%arg0) + // CHECK: return %[[RESULT]] +} // CHECK-LABEL: Relu1 func @Relu1(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { @@ -992,3 +1027,91 @@ func @RemoveCast(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { // CHECK: return %arg0 } +func @squaredDifferenceReluRemoveRelu(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> { + %0 = "tfl.squared_difference"(%arg0, %arg1) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + %1 = "tfl.relu"(%0) : (tensor<1xf32>) -> tensor<1xf32> + return %1: tensor<1xf32> + +// CHECK-LABEL: squaredDifferenceReluRemoveRelu +// CHECK: %[[RESULT:.*]] = tfl.squared_difference %arg0, %arg1 : tensor<1xf32> +// CHECK: return %[[RESULT]] +} + +func @ConvertSqueezeToReshapeWithDynamicDimension(%arg0: tensor) -> tensor { + %0 = "tfl.squeeze"(%arg0) {squeeze_dims = [1]}: (tensor) -> tensor + return %0: tensor + +// CHECK-LABEL: ConvertSqueezeToReshapeWithDynamicDimension +// CHECK: [[CONST:.*]] = constant dense<[-1, 8, 3]> : tensor<3xi32> +// CHECK: %[[RESULT:.*]] = "tfl.reshape"(%arg0, %[[CONST:.*]]) : (tensor, tensor<3xi32>) -> tensor +// CHECK: return 
%[[RESULT]] +} + +func @ConvertSqueezeToReshapeWithDynamicDimension2(%arg0: tensor) -> tensor<1x8x3xf32> { + %0 = "tfl.squeeze"(%arg0) {squeeze_dims = [0]}: (tensor) -> tensor<1x8x3xf32> + return %0: tensor<1x8x3xf32> + +// CHECK-LABEL: ConvertSqueezeToReshapeWithDynamicDimension2 +// CHECK: [[CONST:.*]] = constant dense<[1, 8, 3]> : tensor<3xi32> +// CHECK: %[[RESULT:.*]] = "tfl.reshape"(%arg0, %[[CONST:.*]]) : (tensor, tensor<3xi32>) -> tensor<1x8x3xf32> +// CHECK: return %[[RESULT]] +} + +func @DontConvertSqueezeToReshape(%arg0: tensor<*xf32>) -> tensor<*xf32> { + %0 = "tfl.squeeze"(%arg0) {squeeze_dims = [0]}: (tensor<*xf32>) -> tensor<*xf32> + return %0: tensor<*xf32> + +// CHECK-LABEL: DontConvertSqueezeToReshape +// CHECK: %[[RESULT:.*]] = "tfl.squeeze"(%arg0) +// CHECK: return %[[RESULT]] +} + +func @ConvertPow1ToIdentity(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { + %cst = constant dense<1.000000e+00> : tensor + %0 = "tfl.pow"(%arg0, %cst) : (tensor<2x2xf32>, tensor) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> + +// CHECK-LABEL: ConvertPow1ToIdentity +// CHECK: return %arg0 +} + +func @ConvertPow2ToSquare(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { + %cst = constant dense<2.000000e+00> : tensor + %0 = "tfl.pow"(%arg0, %cst) : (tensor<2x2xf32>, tensor) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> + +// CHECK-LABEL: ConvertPow2ToSquare +// CHECK: %[[RESULT:.*]] = tfl.mul %arg0, %arg0 {fused_activation_function = "NONE"} : tensor<2x2xf32> +// CHECK: return %[[RESULT]] +} + +func @ConvertIdentityGatherNdOp(%arg0: tensor<4x3xf32>) -> tensor<4x3xf32> { + %cst = constant dense<[[0], [1], [2], [3]]> : tensor<4x1xi32> + %0 = "tfl.gather_nd"(%arg0, %cst) : (tensor<4x3xf32>, tensor<4x1xi32>) -> tensor<4x3xf32> + return %0 : tensor<4x3xf32> + +// CHECK-LABEL: ConvertIdentityGatherNdOp +// CHECK-SAME: (%[[ARG:.*]]: tensor<4x3xf32>) -> tensor<4x3xf32> +// CHECK-NEXT: return %[[ARG]] : tensor<4x3xf32> +} + +func @ConvertIdentityGatherNdOp3D(%arg0: tensor<4x3x4xf32>) -> tensor<4x3x4xf32> { + %cst = constant dense<[[0], [1], [2], [3]]> : tensor<4x1xi32> + %0 = "tfl.gather_nd"(%arg0, %cst) : (tensor<4x3x4xf32>, tensor<4x1xi32>) -> tensor<4x3x4xf32> + return %0 : tensor<4x3x4xf32> + +// CHECK-LABEL: ConvertIdentityGatherNdOp3D +// CHECK-SAME: (%[[ARG:.*]]: tensor<4x3x4xf32>) -> tensor<4x3x4xf32> +// CHECK-NEXT: return %[[ARG]] : tensor<4x3x4xf32> +} + +func @ConvertIdentityScatterNd(%arg0: tensor<4x3xf32>) -> tensor<4x3xf32> { + %cst = constant dense<[[0], [1], [2], [3]]> : tensor<4x1xi32> + %shape = constant dense<[4, 3]> : tensor<2xi32> + %0 = "tfl.scatter_nd"(%cst, %arg0, %shape) : (tensor<4x1xi32>, tensor<4x3xf32>, tensor<2xi32>) -> tensor<4x3xf32> + return %0 : tensor<4x3xf32> + +// CHECK-LABEL: ConvertIdentityScatterNd +// CHECK-SAME: (%[[ARG:.*]]: tensor<4x3xf32>) -> tensor<4x3xf32> +// CHECK-NEXT: return %[[ARG]] : tensor<4x3xf32> +} diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-composite-functions-tf.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-composite-functions-tf.mlir index 7ce60d98062..6847cdd5874 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-composite-functions-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-composite-functions-tf.mlir @@ -1,4 +1,4 @@ -// RUN: tf-opt -tfl-prepare-composite-funcs-tf %s -split-input-file -verify-diagnostics | FileCheck %s +// RUN: tf-opt -tfl-prepare-composite-funcs-tf %s -split-input-file -verify-diagnostics | FILECHECK_OPTS="" FileCheck %s module{ func @embedding(%arg0: tensor<*xf32>, %arg1: 
tensor<*xi32>) -> tensor<*xf32> attributes {tf._implements = "embedding_matmul", tf._reference = "mlir"} { @@ -154,18 +154,18 @@ func @layernormalizedlstmcellsimple(%arg0: tensor<1x?xf32>, %arg1: tensor<3x4xf3 // ----- module { -func @inference_standard_lstm_time_major(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor<8x40xf32>, %arg4: tensor<10x40xf32>, %arg5: tensor<40xf32>) -> (tensor, tensor, tensor, tensor, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = false, tf.time_major = true} { +func @inference_standard_lstm_time_major(%arg0: tensor, %arg1: tensor<8x10xf32>, %arg2: tensor<8x10xf32>, %arg3: tensor<8x40xf32>, %arg4: tensor<10x40xf32>, %arg5: tensor<40xf32>) -> (tensor<8x10xf32>, tensor, tensor<8x10xf32>, tensor<8x10xf32>, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: 8 } dim { size: 10 }", "tfshape$dim { size: 8 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: false", "tfshape$unknown_rank: false"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = false, tf.time_major = true} { %0 = "tf.BatchMatMulV2"(%arg0, %arg3) {adj_x = false, adj_y = false} : (tensor, tensor<8x40xf32>) -> tensor %1 = "tf.Add"(%0, %arg5) : (tensor, tensor<40xf32>) -> tensor %2 = "tf.BatchMatMulV2"(%1, %arg4) {adj_x = false, adj_y = true} : (tensor, tensor<10x40xf32>) -> tensor - %3 = "tf.Add"(%2, %arg1) : (tensor, tensor) -> tensor - %4 = "tf.Add"(%2, %arg2) : (tensor, tensor) -> tensor - %5 = "tf.Add"(%arg1, %arg2) : (tensor, tensor) -> tensor + %3 = "tf.Add"(%2, %arg1) : (tensor, tensor<8x10xf32>) -> tensor + %4 = "tf.Add"(%2, %arg2) : (tensor, tensor<8x10xf32>) -> tensor + %5 = "tf.Add"(%arg1, %arg2) : (tensor<8x10xf32>, tensor<8x10xf32>) -> tensor<8x10xf32> %6 = "tf.Const"() {_output_shapes = ["tfshape$"], device = "/device:CPU:0", dtype = f32, value = dense<1.000000e+00> : tensor} : () -> tensor - return %5, %4, %5, %5, %6 : tensor, tensor, tensor, tensor, tensor + return %5, %4, %5, %5, %6 : tensor<8x10xf32>, tensor, tensor<8x10xf32>, tensor<8x10xf32>, tensor } -// CHECK: func @inference_standard_lstm_time_major([[VAL_0:%.*]]: tensor, [[VAL_1:%.*]]: tensor, [[VAL_2:%.*]]: tensor, [[VAL_3:%.*]]: tensor<8x40xf32>, [[VAL_4:%.*]]: tensor<10x40xf32>, [[VAL_5:%.*]]: tensor<40xf32>) -> (tensor<8x10xf32>, tensor, tensor, tensor, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = false, tf.time_major = true} { +// CHECK: func @inference_standard_lstm_time_major([[VAL_0:%.*]]: tensor, [[VAL_1:%.*]]: tensor<8x10xf32>, [[VAL_2:%.*]]: tensor<8x10xf32>, [[VAL_3:%.*]]: tensor<8x40xf32>, [[VAL_4:%.*]]: tensor<10x40xf32>, [[VAL_5:%.*]]: tensor<40xf32>) -> (tensor<8x10xf32>, tensor, tensor<8x10xf32>, tensor<8x10xf32>, tensor) attributes {tf._input_shapes = 
["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: 8 } dim { size: 10 }", "tfshape$dim { size: 8 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: false", "tfshape$unknown_rank: false"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = false, tf.time_major = true} { // CHECK: [[VAL_6:%.*]] = constant dense<[1, 0]> : tensor<2xi32> // CHECK: [[VAL_7:%.*]] = "tf.Transpose"([[VAL_3]], [[VAL_6]]) : (tensor<8x40xf32>, tensor<2xi32>) -> tensor<40x8xf32> // CHECK: [[VAL_8:%.*]] = constant dense<[1, 0]> : tensor<2xi32> @@ -180,33 +180,33 @@ func @inference_standard_lstm_time_major(%arg0: tensor, %arg1: tensor // CHECK: [[VAL_17:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor // CHECK: [[VAL_18:%.*]]:4 = "tf.SplitV"([[VAL_5]], [[VAL_16]], [[VAL_17]]) : (tensor<40xf32>, tensor<4xi32>, tensor) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) // CHECK: [[VAL_19:%.*]] = constant unit -// CHECK: [[VAL_20:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_12]]#0, [[VAL_12]]#1, [[VAL_12]]#2, [[VAL_12]]#3, [[VAL_15]]#0, [[VAL_15]]#1, [[VAL_15]]#2, [[VAL_15]]#3, [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_18]]#0, [[VAL_18]]#1, [[VAL_18]]#2, [[VAL_18]]#3, [[VAL_19]], [[VAL_19]], [[VAL_1]], [[VAL_2]], [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_19]]) {cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true} : (tensor, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor, tensor, none, none, none, none) -> tensor +// CHECK: [[VAL_20:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_12]]#0, [[VAL_12]]#1, [[VAL_12]]#2, [[VAL_12]]#3, [[VAL_15]]#0, [[VAL_15]]#1, [[VAL_15]]#2, [[VAL_15]]#3, [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_18]]#0, [[VAL_18]]#1, [[VAL_18]]#2, [[VAL_18]]#3, [[VAL_19]], [[VAL_19]], [[VAL_1]], [[VAL_2]], [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_19]]) {cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true} : (tensor, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor<8x10xf32>, tensor<8x10xf32>, none, none, none, none) -> tensor // CHECK: [[VAL_21:%.*]] = constant dense<[-1, 0, 0]> : tensor<3xi32> // CHECK: [[VAL_22:%.*]] = constant dense<0> : tensor<3xi32> // CHECK: [[VAL_23:%.*]] = constant dense<1> : tensor<3xi32> // CHECK: [[VAL_24:%.*]] = "tf.StridedSlice"([[VAL_20]], [[VAL_21]], [[VAL_22]], [[VAL_23]]) {begin_mask = 6 : i64, ellipsis_mask = 0 : i64, end_mask = 6 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<8x10xf32> -// CHECK: [[VAL_25:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor -// CHECK: [[VAL_26:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor +// CHECK: [[VAL_25:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor<8x10xf32> +// CHECK: [[VAL_26:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> 
tensor<8x10xf32> // CHECK: [[VAL_27:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor -// CHECK: return [[VAL_24]], [[VAL_20]], [[VAL_25]], [[VAL_26]], [[VAL_27]] : tensor<8x10xf32>, tensor, tensor, tensor, tensor +// CHECK: return [[VAL_24]], [[VAL_20]], [[VAL_25]], [[VAL_26]], [[VAL_27]] : tensor<8x10xf32>, tensor, tensor<8x10xf32>, tensor<8x10xf32>, tensor // CHECK: } } // ----- module { -func @inference_standard_lstm_non_time_major(%arg0: tensor<8x8x8xf32>, %arg1: tensor, %arg2: tensor, %arg3: tensor<8x40xf32>, %arg4: tensor<10x40xf32>, %arg5: tensor<40xf32>) -> (tensor, tensor<8x8x10xf32>, tensor, tensor, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = false, tf.time_major = false} { +func @inference_standard_lstm_non_time_major(%arg0: tensor<8x8x8xf32>, %arg1: tensor<8x10xf32>, %arg2: tensor<8x10xf32>, %arg3: tensor<8x40xf32>, %arg4: tensor<10x40xf32>, %arg5: tensor<40xf32>) -> (tensor<8x10xf32>, tensor<8x8x10xf32>, tensor<8x10xf32>, tensor<8x10xf32>, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: 8 } dim { size: 10 }", "tfshape$dim { size: 8 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: false", "tfshape$unknown_rank: false"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = false, tf.time_major = false} { %0 = "tf.BatchMatMulV2"(%arg0, %arg3) {adj_x = false, adj_y = false} : (tensor<8x8x8xf32>, tensor<8x40xf32>) -> tensor<8x8x40xf32> %1 = "tf.Add"(%0, %arg5) : (tensor<8x8x40xf32>, tensor<40xf32>) -> tensor<8x8x40xf32> %2 = "tf.BatchMatMulV2"(%1, %arg4) {adj_x = false, adj_y = true} : (tensor<8x8x40xf32>, tensor<10x40xf32>) -> tensor<8x8x10xf32> - %3 = "tf.Add"(%2, %arg1) : (tensor<8x8x10xf32>, tensor) -> tensor<8x8x10xf32> - %4 = "tf.Add"(%2, %arg2) : (tensor<8x8x10xf32>, tensor) -> tensor<8x8x10xf32> - %5 = "tf.Add"(%arg1, %arg2) : (tensor, tensor) -> tensor + %3 = "tf.Add"(%2, %arg1) : (tensor<8x8x10xf32>, tensor<8x10xf32>) -> tensor<8x8x10xf32> + %4 = "tf.Add"(%2, %arg2) : (tensor<8x8x10xf32>, tensor<8x10xf32>) -> tensor<8x8x10xf32> + %5 = "tf.Add"(%arg1, %arg2) : (tensor<8x10xf32>, tensor<8x10xf32>) -> tensor<8x10xf32> %6 = "tf.Const"() {_output_shapes = ["tfshape$"], device = "/device:CPU:0", dtype = f32, value = dense<1.000000e+00> : tensor} : () -> tensor - return %5, %4, %5, %5, %6 : tensor, tensor<8x8x10xf32>, tensor, tensor, tensor + return %5, %4, %5, %5, %6 : tensor<8x10xf32>, tensor<8x8x10xf32>, tensor<8x10xf32>, tensor<8x10xf32>, tensor } -// CHECK: func @inference_standard_lstm_non_time_major([[VAL_0:%.*]]: tensor<8x8x8xf32>, [[VAL_1:%.*]]: tensor, [[VAL_2:%.*]]: tensor, [[VAL_3:%.*]]: tensor<8x40xf32>, [[VAL_4:%.*]]: tensor<10x40xf32>, [[VAL_5:%.*]]: tensor<40xf32>) -> (tensor<8x10xf32>, tensor<8x8x10xf32>, tensor, tensor, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true"], tf.api_implements = 
"lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = false, tf.time_major = false} { +// CHECK: func @inference_standard_lstm_non_time_major([[VAL_0:%.*]]: tensor<8x8x8xf32>, [[VAL_1:%.*]]: tensor<8x10xf32>, [[VAL_2:%.*]]: tensor<8x10xf32>, [[VAL_3:%.*]]: tensor<8x40xf32>, [[VAL_4:%.*]]: tensor<10x40xf32>, [[VAL_5:%.*]]: tensor<40xf32>) -> (tensor<8x10xf32>, tensor<8x8x10xf32>, tensor<8x10xf32>, tensor<8x10xf32>, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: 8 } dim { size: 10 }", "tfshape$dim { size: 8 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: false", "tfshape$unknown_rank: false"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = false, tf.time_major = false} { // CHECK: [[VAL_6:%.*]] = constant dense<[1, 0]> : tensor<2xi32> // CHECK: [[VAL_7:%.*]] = "tf.Transpose"([[VAL_3]], [[VAL_6]]) : (tensor<8x40xf32>, tensor<2xi32>) -> tensor<40x8xf32> // CHECK: [[VAL_8:%.*]] = constant dense<[1, 0]> : tensor<2xi32> @@ -221,15 +221,15 @@ func @inference_standard_lstm_non_time_major(%arg0: tensor<8x8x8xf32>, %arg1: te // CHECK: [[VAL_17:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor // CHECK: [[VAL_18:%.*]]:4 = "tf.SplitV"([[VAL_5]], [[VAL_16]], [[VAL_17]]) : (tensor<40xf32>, tensor<4xi32>, tensor) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) // CHECK: [[VAL_19:%.*]] = constant unit -// CHECK: [[VAL_20:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_12]]#0, [[VAL_12]]#1, [[VAL_12]]#2, [[VAL_12]]#3, [[VAL_15]]#0, [[VAL_15]]#1, [[VAL_15]]#2, [[VAL_15]]#3, [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_18]]#0, [[VAL_18]]#1, [[VAL_18]]#2, [[VAL_18]]#3, [[VAL_19]], [[VAL_19]], [[VAL_1]], [[VAL_2]], [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_19]]) {cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = false} : (tensor<8x8x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor, tensor, none, none, none, none) -> tensor<8x8x10xf32> +// CHECK: [[VAL_20:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_12]]#0, [[VAL_12]]#1, [[VAL_12]]#2, [[VAL_12]]#3, [[VAL_15]]#0, [[VAL_15]]#1, [[VAL_15]]#2, [[VAL_15]]#3, [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_18]]#0, [[VAL_18]]#1, [[VAL_18]]#2, [[VAL_18]]#3, [[VAL_19]], [[VAL_19]], [[VAL_1]], [[VAL_2]], [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_19]]) {cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = false} : (tensor<8x8x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor<8x10xf32>, tensor<8x10xf32>, none, none, none, none) -> tensor<8x8x10xf32> // CHECK: [[VAL_21:%.*]] = constant dense<[0, -1, 0]> : tensor<3xi32> // CHECK: [[VAL_22:%.*]] = constant dense<0> : tensor<3xi32> // CHECK: [[VAL_23:%.*]] = constant dense<1> : tensor<3xi32> // CHECK: [[VAL_24:%.*]] = "tf.StridedSlice"([[VAL_20]], [[VAL_21]], [[VAL_22]], [[VAL_23]]) {begin_mask = 5 : i64, ellipsis_mask = 0 : i64, end_mask = 5 
: i64, new_axis_mask = 0 : i64, shrink_axis_mask = 2 : i64} : (tensor<8x8x10xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<8x10xf32> -// CHECK: [[VAL_25:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor -// CHECK: [[VAL_26:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor +// CHECK: [[VAL_25:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor<8x10xf32> +// CHECK: [[VAL_26:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor<8x10xf32> // CHECK: [[VAL_27:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor -// CHECK: return [[VAL_24]], [[VAL_20]], [[VAL_25]], [[VAL_26]], [[VAL_27]] : tensor<8x10xf32>, tensor<8x8x10xf32>, tensor, tensor, tensor +// CHECK: return [[VAL_24]], [[VAL_20]], [[VAL_25]], [[VAL_26]], [[VAL_27]] : tensor<8x10xf32>, tensor<8x8x10xf32>, tensor<8x10xf32>, tensor<8x10xf32>, tensor // CHECK: } } @@ -237,18 +237,18 @@ func @inference_standard_lstm_non_time_major(%arg0: tensor<8x8x8xf32>, %arg1: te // ----- module { -func @inference_standard_lstm_time_major_go_backwards(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor<8x40xf32>, %arg4: tensor<10x40xf32>, %arg5: tensor<40xf32>) -> (tensor, tensor, tensor, tensor, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = true, tf.time_major = true} { +func @inference_standard_lstm_time_major_go_backwards(%arg0: tensor, %arg1: tensor<8x10xf32>, %arg2: tensor<8x10xf32>, %arg3: tensor<8x40xf32>, %arg4: tensor<10x40xf32>, %arg5: tensor<40xf32>) -> (tensor<8x10xf32>, tensor, tensor<8x10xf32>, tensor<8x10xf32>, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: 8 } dim { size: 10 }", "tfshape$dim { size: 8 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: false", "tfshape$unknown_rank: false"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = true, tf.time_major = true} { %0 = "tf.BatchMatMulV2"(%arg0, %arg3) {adj_x = false, adj_y = false} : (tensor, tensor<8x40xf32>) -> tensor %1 = "tf.Add"(%0, %arg5) : (tensor, tensor<40xf32>) -> tensor %2 = "tf.BatchMatMulV2"(%1, %arg4) {adj_x = false, adj_y = true} : (tensor, tensor<10x40xf32>) -> tensor - %3 = "tf.Add"(%2, %arg1) : (tensor, tensor) -> tensor - %4 = "tf.Add"(%2, %arg2) : (tensor, tensor) -> tensor - %5 = "tf.Add"(%arg1, %arg2) : (tensor, tensor) -> tensor + %3 = "tf.Add"(%2, %arg1) : (tensor, tensor<8x10xf32>) -> tensor + %4 = "tf.Add"(%2, %arg2) : (tensor, tensor<8x10xf32>) -> tensor + %5 = "tf.Add"(%arg1, %arg2) : (tensor<8x10xf32>, tensor<8x10xf32>) -> tensor<8x10xf32> %6 = "tf.Const"() {_output_shapes = ["tfshape$"], device = "/device:CPU:0", dtype = f32, value = dense<1.000000e+00> : tensor} : () -> tensor - return %5, %4, %5, %5, %6 : tensor, tensor, tensor, tensor, tensor + return %5, %4, %5, %5, %6 : tensor<8x10xf32>, tensor, tensor<8x10xf32>, tensor<8x10xf32>, tensor } -// CHECK: func @inference_standard_lstm_time_major_go_backwards([[VAL_0:%.*]]: tensor, [[VAL_1:%.*]]: tensor, [[VAL_2:%.*]]: tensor, 
[[VAL_3:%.*]]: tensor<8x40xf32>, [[VAL_4:%.*]]: tensor<10x40xf32>, [[VAL_5:%.*]]: tensor<40xf32>) -> (tensor<8x10xf32>, tensor, tensor, tensor, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = true, tf.time_major = true} { +// CHECK: func @inference_standard_lstm_time_major_go_backwards([[VAL_0:%.*]]: tensor, [[VAL_1:%.*]]: tensor<8x10xf32>, [[VAL_2:%.*]]: tensor<8x10xf32>, [[VAL_3:%.*]]: tensor<8x40xf32>, [[VAL_4:%.*]]: tensor<10x40xf32>, [[VAL_5:%.*]]: tensor<40xf32>) -> (tensor<8x10xf32>, tensor, tensor<8x10xf32>, tensor<8x10xf32>, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: 8 } dim { size: 10 }", "tfshape$dim { size: 8 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: false", "tfshape$unknown_rank: false"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = true, tf.time_major = true} { // CHECK: [[VAL_6:%.*]] = constant dense<0> : tensor<1xi32> // CHECK: [[VAL_7:%.*]] = "tf.ReverseV2"([[VAL_0]], [[VAL_6]]) : (tensor, tensor<1xi32>) -> tensor // CHECK: [[VAL_8:%.*]] = constant dense<[1, 0]> : tensor<2xi32> @@ -265,15 +265,15 @@ func @inference_standard_lstm_time_major_go_backwards(%arg0: tensor, // CHECK: [[VAL_19:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor // CHECK: [[VAL_20:%.*]]:4 = "tf.SplitV"([[VAL_5]], [[VAL_18]], [[VAL_19]]) : (tensor<40xf32>, tensor<4xi32>, tensor) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) // CHECK: [[VAL_21:%.*]] = constant unit -// CHECK: [[VAL_22:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_7]], [[VAL_14]]#0, [[VAL_14]]#1, [[VAL_14]]#2, [[VAL_14]]#3, [[VAL_17]]#0, [[VAL_17]]#1, [[VAL_17]]#2, [[VAL_17]]#3, [[VAL_21]], [[VAL_21]], [[VAL_21]], [[VAL_20]]#0, [[VAL_20]]#1, [[VAL_20]]#2, [[VAL_20]]#3, [[VAL_21]], [[VAL_21]], [[VAL_1]], [[VAL_2]], [[VAL_21]], [[VAL_21]], [[VAL_21]], [[VAL_21]]) {cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true} : (tensor, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor, tensor, none, none, none, none) -> tensor +// CHECK: [[VAL_22:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_7]], [[VAL_14]]#0, [[VAL_14]]#1, [[VAL_14]]#2, [[VAL_14]]#3, [[VAL_17]]#0, [[VAL_17]]#1, [[VAL_17]]#2, [[VAL_17]]#3, [[VAL_21]], [[VAL_21]], [[VAL_21]], [[VAL_20]]#0, [[VAL_20]]#1, [[VAL_20]]#2, [[VAL_20]]#3, [[VAL_21]], [[VAL_21]], [[VAL_1]], [[VAL_2]], [[VAL_21]], [[VAL_21]], [[VAL_21]], [[VAL_21]]) {cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true} : (tensor, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor<8x10xf32>, tensor<8x10xf32>, none, none, none, none) -> tensor // 
CHECK: [[VAL_23:%.*]] = constant dense<[-1, 0, 0]> : tensor<3xi32> // CHECK: [[VAL_24:%.*]] = constant dense<0> : tensor<3xi32> // CHECK: [[VAL_25:%.*]] = constant dense<1> : tensor<3xi32> // CHECK: [[VAL_26:%.*]] = "tf.StridedSlice"([[VAL_22]], [[VAL_23]], [[VAL_24]], [[VAL_25]]) {begin_mask = 6 : i64, ellipsis_mask = 0 : i64, end_mask = 6 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<8x10xf32> -// CHECK: [[VAL_27:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor -// CHECK: [[VAL_28:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor +// CHECK: [[VAL_27:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor<8x10xf32> +// CHECK: [[VAL_28:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor<8x10xf32> // CHECK: [[VAL_29:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor -// CHECK: return [[VAL_26]], [[VAL_22]], [[VAL_27]], [[VAL_28]], [[VAL_29]] : tensor<8x10xf32>, tensor, tensor, tensor, tensor +// CHECK: return [[VAL_26]], [[VAL_22]], [[VAL_27]], [[VAL_28]], [[VAL_29]] : tensor<8x10xf32>, tensor, tensor<8x10xf32>, tensor<8x10xf32>, tensor // CHECK: } } @@ -281,18 +281,18 @@ func @inference_standard_lstm_time_major_go_backwards(%arg0: tensor, // ----- module { -func @inference_standard_lstm_non_time_major_go_backwards(%arg0: tensor<8x8x8xf32>, %arg1: tensor, %arg2: tensor, %arg3: tensor<8x40xf32>, %arg4: tensor<10x40xf32>, %arg5: tensor<40xf32>) -> (tensor, tensor<8x8x10xf32>, tensor, tensor, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = true, tf.time_major = false} { +func @inference_standard_lstm_non_time_major_go_backwards(%arg0: tensor<8x8x8xf32>, %arg1: tensor<8x10xf32>, %arg2: tensor<8x10xf32>, %arg3: tensor<8x40xf32>, %arg4: tensor<10x40xf32>, %arg5: tensor<40xf32>) -> (tensor<8x10xf32>, tensor<8x8x10xf32>, tensor<8x10xf32>, tensor<8x10xf32>, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: 8 } dim { size: 10 }", "tfshape$dim { size: 8 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: false", "tfshape$unknown_rank: false"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = true, tf.time_major = false} { %0 = "tf.BatchMatMulV2"(%arg0, %arg3) {adj_x = false, adj_y = false} : (tensor<8x8x8xf32>, tensor<8x40xf32>) -> tensor<8x8x40xf32> %1 = "tf.Add"(%0, %arg5) : (tensor<8x8x40xf32>, tensor<40xf32>) -> tensor<8x8x40xf32> %2 = "tf.BatchMatMulV2"(%1, %arg4) {adj_x = false, adj_y = true} : (tensor<8x8x40xf32>, tensor<10x40xf32>) -> tensor<8x8x10xf32> - %3 = "tf.Add"(%2, %arg1) : (tensor<8x8x10xf32>, tensor) -> tensor<8x8x10xf32> - %4 = "tf.Add"(%2, %arg2) : (tensor<8x8x10xf32>, tensor) -> tensor<8x8x10xf32> - %5 = "tf.Add"(%arg1, %arg2) : (tensor, tensor) -> tensor + %3 = "tf.Add"(%2, %arg1) : (tensor<8x8x10xf32>, tensor<8x10xf32>) -> tensor<8x8x10xf32> + %4 = "tf.Add"(%2, %arg2) : (tensor<8x8x10xf32>, tensor<8x10xf32>) -> tensor<8x8x10xf32> + %5 = "tf.Add"(%arg1, 
%arg2) : (tensor<8x10xf32>, tensor<8x10xf32>) -> tensor<8x10xf32> %6 = "tf.Const"() {_output_shapes = ["tfshape$"], device = "/device:CPU:0", dtype = f32, value = dense<1.000000e+00> : tensor} : () -> tensor - return %5, %4, %5, %5, %6 : tensor, tensor<8x8x10xf32>, tensor, tensor, tensor + return %5, %4, %5, %5, %6 : tensor<8x10xf32>, tensor<8x8x10xf32>, tensor<8x10xf32>, tensor<8x10xf32>, tensor } -// CHECK: func @inference_standard_lstm_non_time_major_go_backwards([[VAL_0:%.*]]: tensor<8x8x8xf32>, [[VAL_1:%.*]]: tensor, [[VAL_2:%.*]]: tensor, [[VAL_3:%.*]]: tensor<8x40xf32>, [[VAL_4:%.*]]: tensor<10x40xf32>, [[VAL_5:%.*]]: tensor<40xf32>) -> (tensor<8x10xf32>, tensor<8x8x10xf32>, tensor, tensor, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = true, tf.time_major = false} { +// CHECK: func @inference_standard_lstm_non_time_major_go_backwards([[VAL_0:%.*]]: tensor<8x8x8xf32>, [[VAL_1:%.*]]: tensor<8x10xf32>, [[VAL_2:%.*]]: tensor<8x10xf32>, [[VAL_3:%.*]]: tensor<8x40xf32>, [[VAL_4:%.*]]: tensor<10x40xf32>, [[VAL_5:%.*]]: tensor<40xf32>) -> (tensor<8x10xf32>, tensor<8x8x10xf32>, tensor<8x10xf32>, tensor<8x10xf32>, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: 8 } dim { size: 10 }", "tfshape$dim { size: 8 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: false", "tfshape$unknown_rank: false"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = true, tf.time_major = false} { // CHECK: [[VAL_6:%.*]] = constant dense<1> : tensor<1xi32> // CHECK: [[VAL_7:%.*]] = "tf.ReverseV2"([[VAL_0]], [[VAL_6]]) : (tensor<8x8x8xf32>, tensor<1xi32>) -> tensor<8x8x8xf32> // CHECK: [[VAL_8:%.*]] = constant dense<[1, 0]> : tensor<2xi32> @@ -309,15 +309,15 @@ func @inference_standard_lstm_non_time_major_go_backwards(%arg0: tensor<8x8x8xf3 // CHECK: [[VAL_19:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor // CHECK: [[VAL_20:%.*]]:4 = "tf.SplitV"([[VAL_5]], [[VAL_18]], [[VAL_19]]) : (tensor<40xf32>, tensor<4xi32>, tensor) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) // CHECK: [[VAL_21:%.*]] = constant unit -// CHECK: [[VAL_22:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_7]], [[VAL_14]]#0, [[VAL_14]]#1, [[VAL_14]]#2, [[VAL_14]]#3, [[VAL_17]]#0, [[VAL_17]]#1, [[VAL_17]]#2, [[VAL_17]]#3, [[VAL_21]], [[VAL_21]], [[VAL_21]], [[VAL_20]]#0, [[VAL_20]]#1, [[VAL_20]]#2, [[VAL_20]]#3, [[VAL_21]], [[VAL_21]], [[VAL_1]], [[VAL_2]], [[VAL_21]], [[VAL_21]], [[VAL_21]], [[VAL_21]]) {cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = false} : (tensor<8x8x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor, tensor, none, none, none, none) -> tensor<8x8x10xf32> +// CHECK: [[VAL_22:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_7]], [[VAL_14]]#0, [[VAL_14]]#1, [[VAL_14]]#2, [[VAL_14]]#3, [[VAL_17]]#0, [[VAL_17]]#1, 
[[VAL_17]]#2, [[VAL_17]]#3, [[VAL_21]], [[VAL_21]], [[VAL_21]], [[VAL_20]]#0, [[VAL_20]]#1, [[VAL_20]]#2, [[VAL_20]]#3, [[VAL_21]], [[VAL_21]], [[VAL_1]], [[VAL_2]], [[VAL_21]], [[VAL_21]], [[VAL_21]], [[VAL_21]]) {cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = false} : (tensor<8x8x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor<8x10xf32>, tensor<8x10xf32>, none, none, none, none) -> tensor<8x8x10xf32> // CHECK: [[VAL_23:%.*]] = constant dense<[0, -1, 0]> : tensor<3xi32> // CHECK: [[VAL_24:%.*]] = constant dense<0> : tensor<3xi32> // CHECK: [[VAL_25:%.*]] = constant dense<1> : tensor<3xi32> // CHECK: [[VAL_26:%.*]] = "tf.StridedSlice"([[VAL_22]], [[VAL_23]], [[VAL_24]], [[VAL_25]]) {begin_mask = 5 : i64, ellipsis_mask = 0 : i64, end_mask = 5 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 2 : i64} : (tensor<8x8x10xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<8x10xf32> -// CHECK: [[VAL_27:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor -// CHECK: [[VAL_28:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor +// CHECK: [[VAL_27:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor<8x10xf32> +// CHECK: [[VAL_28:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor<8x10xf32> // CHECK: [[VAL_29:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor -// CHECK: return [[VAL_26]], [[VAL_22]], [[VAL_27]], [[VAL_28]], [[VAL_29]] : tensor<8x10xf32>, tensor<8x8x10xf32>, tensor, tensor, tensor +// CHECK: return [[VAL_26]], [[VAL_22]], [[VAL_27]], [[VAL_28]], [[VAL_29]] : tensor<8x10xf32>, tensor<8x8x10xf32>, tensor<8x10xf32>, tensor<8x10xf32>, tensor // CHECK: } } @@ -325,25 +325,25 @@ func @inference_standard_lstm_non_time_major_go_backwards(%arg0: tensor<8x8x8xf3 // ----- module { -func @inference_can_fuse(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor<8x40xf32>, %arg4: tensor<10x40xf32>, %arg5: tensor<40xf32>) { +func @inference_can_fuse(%arg0: tensor, %arg1: tensor<8x10xf32>, %arg2: tensor<8x10xf32>, %arg3: tensor<8x40xf32>, %arg4: tensor<10x40xf32>, %arg5: tensor<40xf32>) { %0 = "tf.Const"() {_output_shapes = ["tfshape$"], device = "", dtype = f32, value = dense<0.000000e+00> : tensor} : () -> tensor - %1:5 = "tf.PartitionedCall"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5) {Tin = ["tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT"], Tout = ["tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT"], _output_shapes = ["tfshape$dim { size: 9 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 9 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$"], _read_only_resource_inputs = [], config = "", config_proto = "\0A\07\0A\03CPU\10\01\0A\07\0A\03GPU\10\002\02J\008\01", device = "", executor_type = "", f = @inference_standard_lstm_time_major_can_fuse} : (tensor, tensor, tensor, tensor<8x40xf32>, tensor<10x40xf32>, tensor<40xf32>) -> (tensor, tensor, tensor, tensor, tensor) + %1:5 = "tf.PartitionedCall"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5) {Tin = ["tfdtype$DT_FLOAT", 
"tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT"], Tout = ["tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT"], _output_shapes = ["tfshape$dim { size: 9 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 9 } dim { size: 10 }", "tfshape$dim { size: 8 } dim { size: 10 }", "tfshape$dim { size: 8 } dim { size: 10 }", "tfshape$"], _read_only_resource_inputs = [], config = "", config_proto = "\0A\07\0A\03CPU\10\01\0A\07\0A\03GPU\10\002\02J\008\01", device = "", executor_type = "", f = @inference_standard_lstm_time_major_can_fuse} : (tensor, tensor<8x10xf32>, tensor<8x10xf32>, tensor<8x40xf32>, tensor<10x40xf32>, tensor<40xf32>) -> (tensor<8x10xf32>, tensor, tensor<8x10xf32>, tensor<8x10xf32>, tensor) %2 = "tf.Add"(%0, %1#1) : (tensor, tensor) -> tensor return } -func @inference_standard_lstm_time_major_can_fuse(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor<8x40xf32>, %arg4: tensor<10x40xf32>, %arg5: tensor<40xf32>) -> (tensor, tensor, tensor, tensor, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = false, tf.time_major = true} { +func @inference_standard_lstm_time_major_can_fuse(%arg0: tensor, %arg1: tensor<8x10xf32>, %arg2: tensor<8x10xf32>, %arg3: tensor<8x40xf32>, %arg4: tensor<10x40xf32>, %arg5: tensor<40xf32>) -> (tensor<8x10xf32>, tensor, tensor<8x10xf32>, tensor<8x10xf32>, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: 8 } dim { size: 10 }", "tfshape$dim { size: 8 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: false", "tfshape$unknown_rank: false"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = false, tf.time_major = true} { %0 = "tf.BatchMatMulV2"(%arg0, %arg3) {adj_x = false, adj_y = false} : (tensor, tensor<8x40xf32>) -> tensor %1 = "tf.Add"(%0, %arg5) : (tensor, tensor<40xf32>) -> tensor %2 = "tf.BatchMatMulV2"(%1, %arg4) {adj_x = false, adj_y = true} : (tensor, tensor<10x40xf32>) -> tensor - %3 = "tf.Add"(%2, %arg1) : (tensor, tensor) -> tensor - %4 = "tf.Add"(%2, %arg2) : (tensor, tensor) -> tensor - %5 = "tf.Add"(%arg1, %arg2) : (tensor, tensor) -> tensor + %3 = "tf.Add"(%2, %arg1) : (tensor, tensor<8x10xf32>) -> tensor + %4 = "tf.Add"(%2, %arg2) : (tensor, tensor<8x10xf32>) -> tensor + %5 = "tf.Add"(%arg1, %arg2) : (tensor<8x10xf32>, tensor<8x10xf32>) -> tensor<8x10xf32> %6 = "tf.Const"() {_output_shapes = ["tfshape$"], device = "/device:CPU:0", dtype = f32, value = dense<1.000000e+00> : tensor} : () -> tensor - return %5, %4, %5, %5, %6 : tensor, tensor, tensor, tensor, tensor + return %5, %4, %5, %5, %6 : tensor<8x10xf32>, tensor, tensor<8x10xf32>, tensor<8x10xf32>, tensor } -// CHECK: func @inference_standard_lstm_time_major_can_fuse([[VAL_0:%.*]]: tensor, [[VAL_1:%.*]]: tensor, [[VAL_2:%.*]]: tensor, [[VAL_3:%.*]]: tensor<8x40xf32>, [[VAL_4:%.*]]: tensor<10x40xf32>, [[VAL_5:%.*]]: tensor<40xf32>) -> (tensor<8x10xf32>, tensor, tensor, tensor, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", 
"tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = false, tf.time_major = true} { +// CHECK: func @inference_standard_lstm_time_major_can_fuse([[VAL_0:%.*]]: tensor, [[VAL_1:%.*]]: tensor<8x10xf32>, [[VAL_2:%.*]]: tensor<8x10xf32>, [[VAL_3:%.*]]: tensor<8x40xf32>, [[VAL_4:%.*]]: tensor<10x40xf32>, [[VAL_5:%.*]]: tensor<40xf32>) -> (tensor<8x10xf32>, tensor, tensor<8x10xf32>, tensor<8x10xf32>, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: 8 } dim { size: 10 }", "tfshape$dim { size: 8 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: false", "tfshape$unknown_rank: false"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = false, tf.time_major = true} { // CHECK: [[VAL_6:%.*]] = constant dense<[1, 0]> : tensor<2xi32> // CHECK: [[VAL_7:%.*]] = "tf.Transpose"([[VAL_3]], [[VAL_6]]) : (tensor<8x40xf32>, tensor<2xi32>) -> tensor<40x8xf32> // CHECK: [[VAL_8:%.*]] = constant dense<[1, 0]> : tensor<2xi32> @@ -358,15 +358,15 @@ func @inference_standard_lstm_time_major_can_fuse(%arg0: tensor, %arg // CHECK: [[VAL_17:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor // CHECK: [[VAL_18:%.*]]:4 = "tf.SplitV"([[VAL_5]], [[VAL_16]], [[VAL_17]]) : (tensor<40xf32>, tensor<4xi32>, tensor) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) // CHECK: [[VAL_19:%.*]] = constant unit -// CHECK: [[VAL_20:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_12]]#0, [[VAL_12]]#1, [[VAL_12]]#2, [[VAL_12]]#3, [[VAL_15]]#0, [[VAL_15]]#1, [[VAL_15]]#2, [[VAL_15]]#3, [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_18]]#0, [[VAL_18]]#1, [[VAL_18]]#2, [[VAL_18]]#3, [[VAL_19]], [[VAL_19]], [[VAL_1]], [[VAL_2]], [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_19]]) {cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true} : (tensor, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor, tensor, none, none, none, none) -> tensor +// CHECK: [[VAL_20:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_12]]#0, [[VAL_12]]#1, [[VAL_12]]#2, [[VAL_12]]#3, [[VAL_15]]#0, [[VAL_15]]#1, [[VAL_15]]#2, [[VAL_15]]#3, [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_18]]#0, [[VAL_18]]#1, [[VAL_18]]#2, [[VAL_18]]#3, [[VAL_19]], [[VAL_19]], [[VAL_1]], [[VAL_2]], [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_19]]) {cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true} : (tensor, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor<8x10xf32>, tensor<8x10xf32>, none, none, none, none) -> tensor // CHECK: [[VAL_21:%.*]] = constant dense<[-1, 0, 0]> : tensor<3xi32> // CHECK: [[VAL_22:%.*]] = constant dense<0> : tensor<3xi32> // CHECK: [[VAL_23:%.*]] = constant dense<1> : tensor<3xi32> // CHECK: [[VAL_24:%.*]] = 
"tf.StridedSlice"([[VAL_20]], [[VAL_21]], [[VAL_22]], [[VAL_23]]) {begin_mask = 6 : i64, ellipsis_mask = 0 : i64, end_mask = 6 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<8x10xf32> -// CHECK: [[VAL_25:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor -// CHECK: [[VAL_26:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor +// CHECK: [[VAL_25:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor<8x10xf32> +// CHECK: [[VAL_26:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor<8x10xf32> // CHECK: [[VAL_27:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor -// CHECK: return [[VAL_24]], [[VAL_20]], [[VAL_25]], [[VAL_26]], [[VAL_27]] : tensor<8x10xf32>, tensor, tensor, tensor, tensor +// CHECK: return [[VAL_24]], [[VAL_20]], [[VAL_25]], [[VAL_26]], [[VAL_27]] : tensor<8x10xf32>, tensor, tensor<8x10xf32>, tensor<8x10xf32>, tensor // CHECK: } } @@ -374,26 +374,26 @@ func @inference_standard_lstm_time_major_can_fuse(%arg0: tensor, %arg // ----- module { -func @inference_can_fuse_last_output(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor<8x40xf32>, %arg4: tensor<10x40xf32>, %arg5: tensor<40xf32>) { +func @inference_can_fuse_last_output(%arg0: tensor, %arg1: tensor<8x10xf32>, %arg2: tensor<8x10xf32>, %arg3: tensor<8x40xf32>, %arg4: tensor<10x40xf32>, %arg5: tensor<40xf32>) { %0 = "tf.Const"() {_output_shapes = ["tfshape$"], device = "", dtype = f32, value = dense<0.000000e+00> : tensor} : () -> tensor - %1:5 = "tf.PartitionedCall"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5) {Tin = ["tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT"], Tout = ["tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT"], _output_shapes = ["tfshape$dim { size: 9 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 9 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$"], _read_only_resource_inputs = [], config = "", config_proto = "\0A\07\0A\03CPU\10\01\0A\07\0A\03GPU\10\002\02J\008\01", device = "", executor_type = "", f = @inference_standard_lstm_time_major_can_fuse_last_output} : (tensor, tensor, tensor, tensor<8x40xf32>, tensor<10x40xf32>, tensor<40xf32>) -> (tensor<8x10xf32>, tensor, tensor, tensor, tensor) + %1:5 = "tf.PartitionedCall"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5) {Tin = ["tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT"], Tout = ["tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT"], _output_shapes = ["tfshape$dim { size: 9 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 9 } dim { size: 10 }", "tfshape$dim { size: 8 } dim { size: 10 }", "tfshape$dim { size: 8 } dim { size: 10 }", "tfshape$"], _read_only_resource_inputs = [], config = "", config_proto = "\0A\07\0A\03CPU\10\01\0A\07\0A\03GPU\10\002\02J\008\01", device = "", executor_type = "", f = @inference_standard_lstm_time_major_can_fuse_last_output} : (tensor, tensor<8x10xf32>, tensor<8x10xf32>, tensor<8x40xf32>, tensor<10x40xf32>, tensor<40xf32>) -> (tensor<8x10xf32>, tensor, tensor<8x10xf32>, tensor<8x10xf32>, tensor) %2 = "tf.Add"(%0, %1#0) : (tensor, tensor<8x10xf32>) -> tensor<8x10xf32> return } -func 
@inference_standard_lstm_time_major_can_fuse_last_output(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor<8x40xf32>, %arg4: tensor<10x40xf32>, %arg5: tensor<40xf32>) -> (tensor<8x10xf32>, tensor, tensor, tensor, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = false, tf.time_major = true} { +func @inference_standard_lstm_time_major_can_fuse_last_output(%arg0: tensor, %arg1: tensor<8x10xf32>, %arg2: tensor<8x10xf32>, %arg3: tensor<8x40xf32>, %arg4: tensor<10x40xf32>, %arg5: tensor<40xf32>) -> (tensor<8x10xf32>, tensor, tensor<8x10xf32>, tensor<8x10xf32>, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: 8 } dim { size: 10 }", "tfshape$dim { size: 8 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: false", "tfshape$unknown_rank: false"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = false, tf.time_major = true} { %0 = "tf.BatchMatMulV2"(%arg0, %arg3) {adj_x = false, adj_y = false} : (tensor, tensor<8x40xf32>) -> tensor %1 = "tf.Add"(%0, %arg5) : (tensor, tensor<40xf32>) -> tensor %2 = "tf.BatchMatMulV2"(%1, %arg4) {adj_x = false, adj_y = true} : (tensor, tensor<10x40xf32>) -> tensor - %3 = "tf.Add"(%2, %arg1) : (tensor, tensor) -> tensor - %4 = "tf.Add"(%2, %arg2) : (tensor, tensor) -> tensor - %5 = "tf.Add"(%arg1, %arg2) : (tensor, tensor) -> tensor + %3 = "tf.Add"(%2, %arg1) : (tensor, tensor<8x10xf32>) -> tensor + %4 = "tf.Add"(%2, %arg2) : (tensor, tensor<8x10xf32>) -> tensor + %5 = "tf.Add"(%arg1, %arg2) : (tensor<8x10xf32>, tensor<8x10xf32>) -> tensor<8x10xf32> %6 = "tf.Const"() {_output_shapes = ["tfshape$"], device = "/device:CPU:0", dtype = f32, value = dense<1.000000e+00> : tensor} : () -> tensor - %7 = "tf.Add"(%arg1, %arg2) : (tensor, tensor) -> tensor<8x10xf32> - return %7, %4, %5, %5, %6 : tensor<8x10xf32>, tensor, tensor, tensor, tensor + %7 = "tf.Add"(%arg1, %arg2) : (tensor<8x10xf32>, tensor<8x10xf32>) -> tensor<8x10xf32> + return %7, %4, %5, %5, %6 : tensor<8x10xf32>, tensor, tensor<8x10xf32>, tensor<8x10xf32>, tensor } -// CHECK: func @inference_standard_lstm_time_major_can_fuse_last_output([[VAL_0:%.*]]: tensor, [[VAL_1:%.*]]: tensor, [[VAL_2:%.*]]: tensor, [[VAL_3:%.*]]: tensor<8x40xf32>, [[VAL_4:%.*]]: tensor<10x40xf32>, [[VAL_5:%.*]]: tensor<40xf32>) -> (tensor<8x10xf32>, tensor, tensor, tensor, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = false, tf.time_major = true} { +// CHECK: func @inference_standard_lstm_time_major_can_fuse_last_output([[VAL_0:%.*]]: tensor, [[VAL_1:%.*]]: tensor<8x10xf32>, [[VAL_2:%.*]]: tensor<8x10xf32>, [[VAL_3:%.*]]: tensor<8x40xf32>, [[VAL_4:%.*]]: tensor<10x40xf32>, [[VAL_5:%.*]]: tensor<40xf32>) -> (tensor<8x10xf32>, tensor, tensor<8x10xf32>, tensor<8x10xf32>, tensor) attributes 
{tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: 8 } dim { size: 10 }", "tfshape$dim { size: 8 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: false", "tfshape$unknown_rank: false"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = false, tf.time_major = true} { // CHECK: [[VAL_6:%.*]] = constant dense<[1, 0]> : tensor<2xi32> // CHECK: [[VAL_7:%.*]] = "tf.Transpose"([[VAL_3]], [[VAL_6]]) : (tensor<8x40xf32>, tensor<2xi32>) -> tensor<40x8xf32> // CHECK: [[VAL_8:%.*]] = constant dense<[1, 0]> : tensor<2xi32> @@ -408,15 +408,15 @@ func @inference_standard_lstm_time_major_can_fuse_last_output(%arg0: tensor : tensor} : () -> tensor // CHECK: [[VAL_18:%.*]]:4 = "tf.SplitV"([[VAL_5]], [[VAL_16]], [[VAL_17]]) : (tensor<40xf32>, tensor<4xi32>, tensor) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) // CHECK: [[VAL_19:%.*]] = constant unit -// CHECK: [[VAL_20:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_12]]#0, [[VAL_12]]#1, [[VAL_12]]#2, [[VAL_12]]#3, [[VAL_15]]#0, [[VAL_15]]#1, [[VAL_15]]#2, [[VAL_15]]#3, [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_18]]#0, [[VAL_18]]#1, [[VAL_18]]#2, [[VAL_18]]#3, [[VAL_19]], [[VAL_19]], [[VAL_1]], [[VAL_2]], [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_19]]) {cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true} : (tensor, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor, tensor, none, none, none, none) -> tensor +// CHECK: [[VAL_20:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_12]]#0, [[VAL_12]]#1, [[VAL_12]]#2, [[VAL_12]]#3, [[VAL_15]]#0, [[VAL_15]]#1, [[VAL_15]]#2, [[VAL_15]]#3, [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_18]]#0, [[VAL_18]]#1, [[VAL_18]]#2, [[VAL_18]]#3, [[VAL_19]], [[VAL_19]], [[VAL_1]], [[VAL_2]], [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_19]]) {cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true} : (tensor, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor<8x10xf32>, tensor<8x10xf32>, none, none, none, none) -> tensor // CHECK: [[VAL_21:%.*]] = constant dense<[-1, 0, 0]> : tensor<3xi32> // CHECK: [[VAL_22:%.*]] = constant dense<0> : tensor<3xi32> // CHECK: [[VAL_23:%.*]] = constant dense<1> : tensor<3xi32> // CHECK: [[VAL_24:%.*]] = "tf.StridedSlice"([[VAL_20]], [[VAL_21]], [[VAL_22]], [[VAL_23]]) {begin_mask = 6 : i64, ellipsis_mask = 0 : i64, end_mask = 6 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<8x10xf32> -// CHECK: [[VAL_25:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor -// CHECK: [[VAL_26:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor +// CHECK: [[VAL_25:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor<8x10xf32> +// CHECK: [[VAL_26:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor<8x10xf32> // CHECK: [[VAL_27:%.*]] = 
"tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor -// CHECK: return [[VAL_24]], [[VAL_20]], [[VAL_25]], [[VAL_26]], [[VAL_27]] : tensor<8x10xf32>, tensor, tensor, tensor, tensor +// CHECK: return [[VAL_24]], [[VAL_20]], [[VAL_25]], [[VAL_26]], [[VAL_27]] : tensor<8x10xf32>, tensor, tensor<8x10xf32>, tensor<8x10xf32>, tensor // CHECK: } } @@ -456,6 +456,32 @@ func @inference_standard_lstm_time_major_cannot_fuse(%arg0: tensor, % // ----- +module { +func @dynamic_shape_non_fuse_standard_lstm(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor<8x40xf32>, %arg4: tensor<10x40xf32>, %arg5: tensor<40xf32>) -> (tensor, tensor, tensor, tensor, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = false, tf.time_major = true} { + %0 = "tf.BatchMatMulV2"(%arg0, %arg3) {adj_x = false, adj_y = false} : (tensor, tensor<8x40xf32>) -> tensor + %1 = "tf.Add"(%0, %arg5) : (tensor, tensor<40xf32>) -> tensor + %2 = "tf.BatchMatMulV2"(%1, %arg4) {adj_x = false, adj_y = true} : (tensor, tensor<10x40xf32>) -> tensor + %3 = "tf.Add"(%2, %arg1) : (tensor, tensor) -> tensor + %4 = "tf.Add"(%2, %arg2) : (tensor, tensor) -> tensor + %5 = "tf.Add"(%arg1, %arg2) : (tensor, tensor) -> tensor + %6 = "tf.Const"() {_output_shapes = ["tfshape$"], device = "/device:CPU:0", dtype = f32, value = dense<1.000000e+00> : tensor} : () -> tensor + return %5, %4, %5, %5, %6 : tensor, tensor, tensor, tensor, tensor +} + +// CHECK: func @dynamic_shape_non_fuse_standard_lstm(%[[VAL_0:.*]]: tensor, %[[VAL_1:.*]]: tensor, %[[VAL_2:.*]]: tensor, %[[VAL_3:.*]]: tensor<8x40xf32>, %[[VAL_4:.*]]: tensor<10x40xf32>, %[[VAL_5:.*]]: tensor<40xf32>) -> (tensor, tensor, tensor, tensor, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = false, tf.time_major = true} { +// CHECK: %[[VAL_6:.*]] = "tf.BatchMatMulV2"(%[[VAL_0]], %[[VAL_3]]) {adj_x = false, adj_y = false} : (tensor, tensor<8x40xf32>) -> tensor +// CHECK: %[[VAL_7:.*]] = "tf.Add"(%[[VAL_6]], %[[VAL_5]]) : (tensor, tensor<40xf32>) -> tensor +// CHECK: %[[VAL_8:.*]] = "tf.BatchMatMulV2"(%[[VAL_7]], %[[VAL_4]]) {adj_x = false, adj_y = true} : (tensor, tensor<10x40xf32>) -> tensor +// CHECK: %[[VAL_9:.*]] = "tf.Add"(%[[VAL_8]], %[[VAL_1]]) : (tensor, tensor) -> tensor +// CHECK: %[[VAL_10:.*]] = "tf.Add"(%[[VAL_8]], %[[VAL_2]]) : (tensor, tensor) -> tensor +// CHECK: %[[VAL_11:.*]] = "tf.Add"(%[[VAL_1]], %[[VAL_2]]) : (tensor, tensor) -> tensor +// CHECK: %[[VAL_12:.*]] = "tf.Const"() {_output_shapes = ["tfshape$"], device = "/device:CPU:0", dtype = f32, value = dense<1.000000e+00> : tensor} : () -> tensor +// CHECK: return %[[VAL_11]], %[[VAL_10]], %[[VAL_11]], %[[VAL_11]], %[[VAL_12]] : tensor, tensor, tensor, tensor, tensor +// CHECK: } +} + +// ----- + module { func @nms_padded(%arg0: tensor<100x4xf32>, %arg1: tensor<100xf32>, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: 
tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor) -> (tensor<1x10xi32>, tensor) attributes {tf._implements = "non_max_suppression_padded_v2", tf._reference = "mlir"} { %0 = "tf.Const"() {value = dense<1> : tensor<1x10xi32>} : () -> tensor<1x10xi32> @@ -481,3 +507,15 @@ func @nms_padded_invalid_num_args(%arg0: tensor<100x4xf32>, %arg1: tensor<100xf3 // expected-error @+1 {{TFLite does not support batched input for non_max_suppression_padded}} func @nms_padded_with_batches(%arg0: tensor<2x100x4xf32>, %arg1: tensor<2x100xf32>, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor) -> (tensor<2x10xi32>, tensor) attributes {tf._implements = "non_max_suppression_padded_v2", tf._reference = "mlir"} } + +// ----- + +module { +// CHECK-LABEL: func @some_func +// CHECK-LABEL: func @func_with_call +func @some_func(%arg0: tensor<100xf32>) -> tensor<100xf32> attributes {tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c"} +func @func_with_call(%arg0: tensor<100xf32>) -> tensor<100xf32> { + %0 = call @some_func(%arg0) : (tensor<100xf32>) -> tensor<100xf32> + return %0 : tensor<100xf32> + } +} diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir index 7b51ec32f89..066139e179b 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir @@ -1,5 +1,7 @@ // RUN: tf-opt -tfl-prepare-tf %s | FileCheck %s +module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} { + func @conv(tensor<256x32x32x3xf32>, tensor<3x3x3x16xf32>, tensor<256x3x32x32xf32>) -> (tensor<256x30x30x16xf32>, tensor<256x16x30x30xf32>, tensor<256x30x30x16xf32>, tensor<256x30x30x16xf32>, tensor<256x30x30x16xf32>) { ^bb0(%arg0: tensor<256x32x32x3xf32>, %arg1: tensor<3x3x3x16xf32>, %arg2: tensor<256x3x32x32xf32>) : // OK @@ -578,3 +580,19 @@ func @MatrixSetDiagV3Conversion(%arg0: tensor<3x3xi32>, %arg1: tensor<3xi32>) -> // CHECK: %[[RES:.*]] = "tf.MatrixSetDiag"(%arg0, %arg1) : (tensor<3x3xi32>, tensor<3xi32>) -> tensor<3x3xi32> // CHECK: return %[[RES]] } + +// CHECK-LABEL: xla_conv +func @xla_conv(%arg0: tensor<4x8x8x16xf32>) -> tensor<4x8x8x16xf32> { + %0 = "tf.Const"() {value = dense<1.000000e+00> : tensor<3x3x16x16xf32>} : () -> tensor<3x3x16x16xf32> loc("Const_1") + %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor loc("XlaConv/feature_group_count") + %2 = "tf.Const"() {value = dense<1> : tensor<2x2xi32>} : () -> tensor<2x2xi32> loc("XlaConv/padding") + %3 = "tf.Const"() {value = dense<1> : tensor<2xi32>} : () -> tensor<2xi32> loc("XlaConv/window_strides") + %4 = "tf.XlaConv"(%arg0, %0, %3, %2, %3, %3, %1) {device = "", dimension_numbers = "\18\02 \032\02\00\01@\03P\03Z\02\01\02b\02\01\02", precision_config = ""} : (tensor<4x8x8x16xf32>, tensor<3x3x16x16xf32>, tensor<2xi32>, tensor<2x2xi32>, tensor<2xi32>, tensor<2xi32>, tensor) -> tensor<4x8x8x16xf32> + return %4 : tensor<4x8x8x16xf32> + // CHECK: %[[CST:.*]] = constant dense<0.000000e+00> : tensor<16xf32> + // CHECK: %[[CST0:.*]] = constant dense<1.000000e+00> : tensor<16x3x3x16xf32> + // CHECK: %[[RES:.*]] = "tfl.conv_2d"(%arg0, %[[CST0]], %[[CST]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<4x8x8x16xf32>, tensor<16x3x3x16xf32>, tensor<16xf32>) -> tensor<4x8x8x16xf32> + // CHECK: return %[[RES]] +} + +} diff --git 
a/tensorflow/compiler/mlir/lite/tests/raise-custom-ops.mlir b/tensorflow/compiler/mlir/lite/tests/raise-custom-ops.mlir new file mode 100644 index 00000000000..1bac8019a30 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/raise-custom-ops.mlir @@ -0,0 +1,20 @@ +// RUN: tf-opt -tfl-raise-custom-ops -canonicalize %s -o - | FileCheck %s + +// CHECK-LABEL: custom_op +func @custom_op(%arg0: tensor<4xf32>) -> tensor<4xf32> { + %0 = "tfl.pseudo_const" () {value = dense<1.0> : tensor<4xf32>} : () -> tensor<4xf32> + %1 = "tfl.mul"(%arg0, %0) {fused_activation_function = "NONE"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + // will be preserved since it has uses. + %2 = "tf.MyCustomOp"(%1, %0) {fused_activation_function = "RELU", int_attr = 2 : i32} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + // will be removed since it doesn't have uses and doesn't have side effect. + "tf.MyCustomOp"(%1, %0) {fused_activation_function = "RELU", int_attr = 2 : i32} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + return %2 : tensor<4xf32> + +// CHECK-NEXT: %[[CST:.*]] = constant dense<1.000000e+00> +// CHECK-NEXT: %[[MUL:.*]] = tfl.mul %arg0, %[[CST]] {fused_activation_function = "NONE"} : tensor<4xf32> +// CHECK-NEXT: %[[CUSTOM:.*]] = "tfl.custom_tf"(%[[MUL]], %[[CST]]) ( { +// CHECK-NEXT: %[[MY_CUSTOM:.*]] = "tf.MyCustomOp"(%[[MUL]], %[[CST]]) {fused_activation_function = "RELU", int_attr = 2 : i32} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> +// CHECK-NEXT: "tfl.yield"(%[[MY_CUSTOM]]) : (tensor<4xf32>) -> () +// CHECK-NEXT: }) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> +// CHECK-NEXT: return %[[CUSTOM]] : tensor<4xf32> +} diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc index 1e1c431822d..d63eb481376 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc +++ b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc @@ -166,6 +166,10 @@ void AddTFToTFLConversionPasses(const mlir::TFL::PassConfig& pass_config, // The below passes only make sense if Builtin TFLite ops are enabled // for emission. if (pass_config.emit_builtin_tflite_ops) { + // Run shape inference after variables are converted to constants. + if (pass_config.shape_inference) { + pass_manager->addPass(mlir::TF::CreateTFShapeInferencePass()); + } // Prepare for TFLite dialect, rerun canonicalization, and then legalize to // the TFLite dialect. pass_manager->addPass( @@ -173,8 +177,19 @@ void AddTFToTFLConversionPasses(const mlir::TFL::PassConfig& pass_config, pass_manager->addNestedPass<mlir::FuncOp>(mlir::createCanonicalizerPass()); if (pass_config.shape_inference) { // Add a shape inference pass to optimize away the unnecessary casts. + // This also fixes the unranked shapes due to TF ops constant folding. + // TODO(fengliuai): remove this pass if TableGen patterns have a better way + // to control the shapes for the intermediate results. pass_manager->addPass(mlir::TF::CreateTFShapeInferencePass()); } + + // Inline function calls that are left in the graph after folding functional + // control flow ops (IfOp, CaseOp). + pass_manager->addPass(mlir::createInlinerPass()); + + // This pass removes the asset file dependencies in hash table use cases.
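+  // A note on ordering, kept as a rough sketch of the intent rather than a
+  // guarantee: shape inference runs once before the TFLite prepare pass so
+  // that shapes refined after variables become constants are available, and
+  // once more after canonicalization so that the casts and unranked results
+  // left behind by TF constant folding get cleaned up. The inliner has to
+  // come after the functional control flow ops (IfOp, CaseOp) are folded,
+  // otherwise their callees are still referenced and cannot be merged into
+  // the caller. A minimal standalone pipeline with the same shape, assuming
+  // only the passes named above, would look roughly like:
+  //
+  //   mlir::PassManager pm(module.getContext());
+  //   pm.addPass(mlir::TF::CreateTFShapeInferencePass());
+  //   pm.addNestedPass<mlir::FuncOp>(mlir::createCanonicalizerPass());
+  //   pm.addPass(mlir::TF::CreateTFShapeInferencePass());
+  //   pm.addPass(mlir::createInlinerPass());
+  //   (void)pm.run(module);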
+ pass_manager->addPass(mlir::TF::CreateInitTextFileToImportPass()); + pass_manager->addPass( mlir::TFL::CreateLegalizeTFPass(pass_config.runtime_verification)); pass_manager->addPass(mlir::TFL::CreateOptimizePass()); @@ -182,6 +197,7 @@ void AddTFToTFLConversionPasses(const mlir::TFL::PassConfig& pass_config, // so that it can target constants introduced once TensorFlow Identity ops // are removed during legalization. pass_manager->addPass(mlir::TFL::CreateOptimizeFunctionalOpsPass()); + pass_manager->addPass(mlir::TFL::CreateRaiseCustomOpsPass()); pass_manager->addPass(mlir::createSymbolDCEPass()); pass_manager->addNestedPass(mlir::createCanonicalizerPass()); pass_manager->addNestedPass(mlir::createCSEPass()); diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc b/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc index 963ab743a83..046c7bbbcf0 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc +++ b/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc @@ -144,6 +144,10 @@ int main(int argc, char **argv) { StatusOr module; + tensorflow::GraphImportConfig specs; + specs.upgrade_legacy = upgrade_legacy; + specs.prune_unused_nodes = true; + // TODO(b/147435528): We need to test the e2e behavior once the graph freezing // inside mlir is done. if (import_saved_model_object_graph || import_saved_model_signature_defs) { @@ -168,12 +172,10 @@ int main(int argc, char **argv) { return kTrFailure; } - module = tensorflow::ImportSavedModel(input_file_name, saved_model_version, - tags, exported_names, &context); + module = + tensorflow::ImportSavedModel(input_file_name, saved_model_version, tags, + exported_names, specs, &context); } else { - tensorflow::GraphImportConfig specs; - specs.upgrade_legacy = upgrade_legacy; - specs.prune_unused_nodes = true; module = tensorflow::LoadFromGraphdefOrMlirSource( input_file_name, input_mlir, use_splatted_constant, custom_opdefs, specs, debug_info_file, input_arrays, input_dtypes, input_shapes, diff --git a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc index 714bc493bed..c158f3a8e21 100644 --- a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc @@ -129,6 +129,18 @@ Status ConvertTFExecutorToTFLOrFlatbuffer( bool emit_select_tf_ops, bool emit_custom_ops, const mlir::TFL::QuantizationSpecs& quant_specs, std::string* result, mlir::PassManager* pass_manager) { + // Register a warning handler only log to std out. 
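+  // Returning mlir::failure() below marks the diagnostic as unhandled, so
+  // warnings are only echoed here (to stdout and the warning log) and every
+  // diagnostic still reaches the other registered handlers, including the
+  // StatusScopedDiagnosticHandler set up right after this one.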
+ mlir::ScopedDiagnosticHandler s( + module.getContext(), [](mlir::Diagnostic& diag) { + if (diag.getSeverity() == mlir::DiagnosticSeverity::Warning) { + for (auto& note : diag.getNotes()) { + std::cout << note.str() << "\n"; + LOG(WARNING) << note.str() << "\n"; + } + } + return mlir::failure(); + }); + mlir::StatusScopedDiagnosticHandler statusHandler(module.getContext(), /*propagate=*/true); @@ -186,7 +198,8 @@ Status ConvertTFExecutorToTFLOrFlatbuffer( StatusOr ImportSavedModel( const std::string& input_filename, const int saved_model_version, const std::unordered_set& tags, - absl::Span exported_names, mlir::MLIRContext* context) { + absl::Span exported_names, const GraphImportConfig& specs, + mlir::MLIRContext* context) { if (saved_model_version == 2) { auto module_or = tensorflow::SavedModelObjectGraphToMlirImport( input_filename, tags, exported_names, context); @@ -194,7 +207,7 @@ StatusOr ImportSavedModel( return module_or.ConsumeValueOrDie(); } else if (saved_model_version == 1) { auto module_or = tensorflow::SavedModelSignatureDefsToMlirImport( - input_filename, tags, exported_names, context); + input_filename, tags, exported_names, context, specs.upgrade_legacy); if (!module_or.status().ok()) return module_or.status(); return module_or.ConsumeValueOrDie(); diff --git a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h index 4ad58c4f8ef..8f1edec8879 100644 --- a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h +++ b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h @@ -48,7 +48,8 @@ LoadFromGraphdefOrMlirSource( stream_executor::port::StatusOr ImportSavedModel( const std::string& input_filename, const int saved_model_version, const std::unordered_set& tags, - absl::Span exported_names, mlir::MLIRContext* context); + absl::Span exported_names, const GraphImportConfig& specs, + mlir::MLIRContext* context); // Taking a MLIR module in TF executor dialect and a set of parameters, // applies a set of passes to convert the module to TF Lite dialect and diff --git a/tensorflow/compiler/mlir/lite/transforms/dense_to_sparse.cc b/tensorflow/compiler/mlir/lite/transforms/dense_to_sparse.cc index f5ef2585be5..50a7ee52430 100644 --- a/tensorflow/compiler/mlir/lite/transforms/dense_to_sparse.cc +++ b/tensorflow/compiler/mlir/lite/transforms/dense_to_sparse.cc @@ -16,6 +16,7 @@ limitations under the License. // This transformation pass convert dense tensor to sparse format. #include "absl/memory/memory.h" +#include "third_party/eigen3/Eigen/Core" #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project @@ -36,6 +37,16 @@ const float kMinSparsityLevel = 0.3; // Heuristic to check if a block configuration is correct. 
const float kBlockOverRandomSparsityRatio = 0.9; +Eigen::half APFloatToEigenHalf(const APFloat& val) { + uint16_t raw_data = val.bitcastToAPInt().getZExtValue(); + return Eigen::half_impl::raw_uint16_to_half(raw_data); +} + +APFloat EigenHalfToAPFloat(const Eigen::half& val) { + uint16_t raw_data = val.x; + return APFloat(APFloat::IEEEhalf(), APInt(16, raw_data)); +} + void PopulateEncodingParams(const std::vector& block_size, std::vector* traversal_order, std::vector* format, @@ -64,14 +75,18 @@ void PopulateEncodingParams(const std::vector& block_size, } } +inline float GetSparsity(const int num_zeros, const int num_elements) { + return (1.0 * num_zeros / num_elements); +} + float CalculateRandomSparsity(const ElementsAttr& attr, const ShapedType& type) { int num_elements = type.getNumElements(); int num_zeros = 0; - if (type.getElementType().isF32()) { - for (const auto val : attr.getValues()) { - if (val == 0.f) { + if (type.getElementType().isa()) { + for (const auto val : attr.getValues()) { + if (val.isZero()) { num_zeros++; } } @@ -83,7 +98,7 @@ float CalculateRandomSparsity(const ElementsAttr& attr, } } - return 1.0 * num_zeros / num_elements; + return GetSparsity(num_zeros, num_elements); } float CalculateBlockSparsity(const ElementsAttr& attr, const ShapedType& type, @@ -108,7 +123,19 @@ float CalculateBlockSparsity(const ElementsAttr& attr, const ShapedType& type, for (const auto val : attr.getValues()) data.push_back(val); format_converter.DenseToSparse(data.data()); sparsity = - 1 - 1.0 * format_converter.GetData().size() / type.getNumElements(); + GetSparsity(type.getNumElements() - format_converter.GetData().size(), + type.getNumElements()); + } else if (type.getElementType().isF16()) { + tflite::optimize::sparsity::FormatConverter format_converter( + shape, traversal_order, format, b_size, b_map); + std::vector data; + data.reserve(type.getNumElements()); + for (const auto& val : attr.getValues()) + data.push_back(APFloatToEigenHalf(val)); + format_converter.DenseToSparse(data.data()); + sparsity = + GetSparsity(type.getNumElements() - format_converter.GetData().size(), + type.getNumElements()); } else if (type.getElementType().isa()) { tflite::optimize::sparsity::FormatConverter format_converter( shape, traversal_order, format, b_size, b_map); @@ -117,7 +144,8 @@ float CalculateBlockSparsity(const ElementsAttr& attr, const ShapedType& type, for (const auto val : attr.getValues()) data.push_back(val); format_converter.DenseToSparse(data.data()); sparsity = - 1 - 1.0 * format_converter.GetData().size() / type.getNumElements(); + GetSparsity(type.getNumElements() - format_converter.GetData().size(), + type.getNumElements()); } return sparsity; @@ -184,8 +212,8 @@ InspectResult InspectWeight( template std::vector BuildSparsityParameterAttribute( - const std::vector& block_size, Operation* inst, OpBuilder* builder, - SparsityParameterAttr* s_param) { + const std::vector& block_size, const T* dense_buffer, Operation* inst, + OpBuilder* builder, SparsityParameterAttr* s_param) { ElementsAttr attr; ShapedType type; if (auto cst = dyn_cast(inst)) { @@ -210,10 +238,7 @@ std::vector BuildSparsityParameterAttribute( tflite::optimize::sparsity::FormatConverter format_converter( shape, traversal_order, format, b_size, b_map); - std::vector data; - data.reserve(type.getNumElements()); - for (const auto val : attr.getValues()) data.push_back(val); - format_converter.DenseToSparse(data.data()); + format_converter.DenseToSparse(dense_buffer); auto metadata = 
format_converter.GetDimMetadata(); auto compressed_data = format_converter.GetData(); const int dim_size = metadata.size() / 2; @@ -264,15 +289,28 @@ void DenseToSparse::runOnFunction() { func.walk([&](SparseOpInterface sparse_op) { const auto& sparse_operands = sparse_op.GetSparseOperands(); std::vector> supported_block_size; - for (const int operand : sparse_operands) { + for (int operand : sparse_operands) { auto* op = sparse_op.getOperation(); - const auto& value = op->getOperand(operand); + auto value = op->getOperand(operand); auto* inst = value.getDefiningOp(); if (!inst) { continue; } + // There could be a Dequantize op after the weight tensor in cases like + // fp16 post-training quantization. We need to get the weight from the + // input of the Dequantize op. + if (isa(inst)) { + op = inst; + value = inst->getOperand(0); + inst = value.getDefiningOp(); + if (!inst) { + continue; + } + operand = 0; + } + ShapedType type; if (isa(inst)) { supported_block_size = sparse_op.GetFloatBlockSize(); @@ -297,22 +335,60 @@ void DenseToSparse::runOnFunction() { builder.setInsertionPoint(op); SparsityParameterAttr s_param; if (auto cst = dyn_cast(inst)) { - std::vector compressed_data = - BuildSparsityParameterAttribute(result.selected_block_size, - inst, &builder, &s_param); - auto compressed_data_type = RankedTensorType::get( - {static_cast(compressed_data.size())}, - builder.getF32Type()); - auto new_value = DenseElementsAttr::get(compressed_data_type, - compressed_data); - auto s_const = builder.create(op->getLoc(), cst.value(), - s_param, new_value); - value.replaceAllUsesWith(s_const.getResult()); - cst.erase(); + auto attr = cst.value(); + auto type = cst.getType().cast(); + if (type.getElementType().isF32()) { + std::vector dense_data; + dense_data.reserve(type.getNumElements()); + for (const auto val : attr.getValues()) + dense_data.push_back(val); + std::vector compressed_data = + BuildSparsityParameterAttribute(result.selected_block_size, + dense_data.data(), inst, + &builder, &s_param); + auto compressed_data_type = RankedTensorType::get( + {static_cast(compressed_data.size())}, + builder.getF32Type()); + auto new_value = DenseElementsAttr::get(compressed_data_type, + compressed_data); + auto s_const = builder.create( + op->getLoc(), cst.value(), s_param, new_value); + value.replaceAllUsesWith(s_const.getResult()); + cst.erase(); + } else if (type.getElementType().isF16()) { + std::vector dense_data; + dense_data.reserve(type.getNumElements()); + for (const auto& val : attr.getValues()) + dense_data.push_back(APFloatToEigenHalf(val)); + std::vector compressed_data = + BuildSparsityParameterAttribute( + result.selected_block_size, dense_data.data(), inst, &builder, + &s_param); + std::vector apfloat_data; + apfloat_data.reserve(type.getNumElements()); + for (const auto& val : compressed_data) + apfloat_data.push_back(EigenHalfToAPFloat(val)); + auto compressed_data_type = RankedTensorType::get( + {static_cast(compressed_data.size())}, + type.getElementType()); + auto new_value = + DenseElementsAttr::get(compressed_data_type, apfloat_data); + auto s_const = builder.create( + op->getLoc(), cst.value(), s_param, new_value); + value.replaceAllUsesWith(s_const.getResult()); + cst.erase(); + } } else if (auto cst = dyn_cast(inst)) { + auto attr = cst.value(); + auto type = cst.getType().cast(); + std::vector dense_data(type.getNumElements()); + dense_data.reserve(type.getNumElements()); + for (const auto& val : attr.getValues()) + dense_data.push_back(val); std::vector compressed_data = 
BuildSparsityParameterAttribute(result.selected_block_size, - inst, &builder, &s_param); + dense_data.data(), inst, + &builder, &s_param); auto compressed_data_type = RankedTensorType::get( {static_cast(compressed_data.size())}, builder.getIntegerType(8, true)); diff --git a/tensorflow/compiler/mlir/lite/transforms/dilated_conv.h b/tensorflow/compiler/mlir/lite/transforms/dilated_conv.h index b745be7753a..2054bab4185 100644 --- a/tensorflow/compiler/mlir/lite/transforms/dilated_conv.h +++ b/tensorflow/compiler/mlir/lite/transforms/dilated_conv.h @@ -276,7 +276,7 @@ ConvertTFDilatedConvOp::ExtractDilationsAttrFromBlockShape( } // Check that the block_shape of `stb_op` and `bts_op` are equal. if (stb_bs_attr.getNumElements() != bts_bs_attr.getNumElements()) return {}; - for (uint64_t i = 0; i < stb_bs_attr.getNumElements(); ++i) { + for (uint64_t i = 0, end = stb_bs_attr.getNumElements(); i < end; ++i) { if (stb_bs_attr.getValue({i}) != bts_bs_attr.getValue({i})) return {}; } diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td index f5b45df3eee..47cfaecd3fb 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td @@ -66,8 +66,10 @@ def LegalizeTFConstToTFLConst: Pat<(TF_ConstOp ElementsAttr:$value), (TFL_ConstOp $value)>; // Convert to std constant for statically shaped, non-opaque constants. -def : Pat<(TF_ConstOp:$res NonOpaqueElementsAttr:$value), (ConstantOp $value), - [(AnyStaticShapeTensor $res)], (addBenefit 10)>; +def ConvertTfConstToStdConst : Pat< + (TF_ConstOp:$res NonOpaqueElementsAttr:$value), + (ConstantOp $value), + [(AnyStaticShapeTensor $res)], (addBenefit 10)>; //===----------------------------------------------------------------------===// // Unary ops patterns. 
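// Note on the renames in this file: giving names to previously anonymous
// `def : Pat<...>` records does not change the generated rewrite logic. The
// record name becomes the identifier of the pattern struct that TableGen
// emits into the generated rewriters, which makes debug output and pattern
// coverage reports easier to read than the auto-generated anonymous names.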
@@ -162,186 +164,234 @@ def LegalizeMaximum : Pat<(TF_MaximumOp $arg1, $arg2), def LegalizeMinimum : Pat<(TF_MinimumOp $arg1, $arg2), (TFL_MinimumOp $arg1, $arg2)>; -def : Pat<(TF_NegOp $arg), (TFL_NegOp $arg)>; -def : Pat<(TF_OneHotOp $indices, $depth, $on_value, $off_value, $axis), - (TFL_OneHotOp $indices, $depth, $on_value, $off_value, - (convertIntAttrTo32Bit $axis))>; -def : Pat<(TF_PowOp $x, $y), (TFL_PowOp $x, $y)>; -def : Pat<(TF_RangeOp $start, $limit, $delta), (TFL_RangeOp $start, $limit, $delta)>; -def : Pat<(TF_Relu6Op $arg), (TFL_Relu6Op $arg)>; -def : Pat<(TF_ReluOp $arg), (TFL_ReluOp $arg)>; -def : Pat<(TF_ReverseSequenceOp $input, $seq_lengths, $seq_dim, $batch_dim), - (TFL_ReverseSequenceOp $input, $seq_lengths, - (convertIntAttrTo32Bit $seq_dim), - (convertIntAttrTo32Bit $batch_dim))>; -def : Pat<(TF_RoundOp $arg), (TFL_RoundOp $arg)>; -def : Pat<(TF_RsqrtOp $arg), (TFL_RsqrtOp $arg)>; -def : Pat<(TF_SqrtOp $arg), (TFL_SqrtOp $arg)>; -def : Pat<(TF_SquareOp $arg), (TFL_SquareOp $arg)>; -def : Pat<(TF_SegmentSumOp $data, I32Tensor:$segment_ids), (TFL_SegmentSumOp $data, $segment_ids)>; -def : Pat<(TF_SelectOp $cond, $x, $y), (TFL_SelectOp $cond, $x, $y)>; -def : Pat<(TF_SelectV2Op:$src_op $cond, $x, $y), (TFL_SelectOp $cond, $x, $y), [(HasSameStaticShapes $src_op)]>; -def : Pat<(TF_SelectV2Op:$src_op $cond, $x, $y), (TFL_SelectV2Op $cond, $x, $y), [(HasNotSameStaticShapes $src_op)]>; -def : Pat<(TF_ShapeOp $arg), (TFL_ShapeOp $arg)>; -def : Pat<(TF_SigmoidOp $arg), (TFL_LogisticOp $arg)>; -def : Pat<(TF_SinOp F32Tensor:$arg), (TFL_SinOp $arg)>; -def : Pat<(TF_SliceOp $input, $begin, $size), (TFL_SliceOp $input, $begin, $size)>; -def : Pat<(TF_SoftmaxOp $arg), (TFL_SoftmaxOp $arg, ConstF32Attr<"1.0">)>; -def : Pat<(TF_SoftplusOp F32Tensor:$arg0), (TFL_LogOp (TFL_AddOp (TFL_ExpOp $arg0), (ConstantOp ConstantAttr, "1.0f">), TFL_AF_None))>; -def : Pat<(TF_SqueezeOp $arg, $squeeze_dims), (TFL_SqueezeOp $arg, $squeeze_dims)>; -def : Pat<(TF_TanhOp $arg), (TFL_TanhOp $arg)>; -def : Pat<(TF_TransposeOp $arg, $perm), (TFL_TransposeOp $arg, $perm)>; -def : Pat<(TF_WhereOp $arg), (TFL_WhereOp $arg)>; -def : Pat<(TF_ZerosLikeOp $arg), (TFL_ZerosLikeOp $arg)>; +def LegalizeNeg : Pat<(TF_NegOp $arg), (TFL_NegOp $arg)>; +def LegalizeOneHot : Pat< + (TF_OneHotOp $indices, $depth, $on_value, $off_value, $axis), + (TFL_OneHotOp $indices, $depth, $on_value, $off_value, + (convertIntAttrTo32Bit $axis))>; +def LegalizePow : Pat<(TF_PowOp $x, $y), (TFL_PowOp $x, $y)>; +def LegalizeRange : Pat<(TF_RangeOp $start, $limit, $delta), + (TFL_RangeOp $start, $limit, $delta)>; +def LegalizeRelu6 : Pat<(TF_Relu6Op $arg), (TFL_Relu6Op $arg)>; +def LegalizeRelu : Pat<(TF_ReluOp $arg), (TFL_ReluOp $arg)>; +def LegalizeReverseSequence : Pat< + (TF_ReverseSequenceOp $input, $seq_lengths, $seq_dim, $batch_dim), + (TFL_ReverseSequenceOp $input, $seq_lengths, + (convertIntAttrTo32Bit $seq_dim), (convertIntAttrTo32Bit $batch_dim))>; +def LegalizeRound : Pat<(TF_RoundOp $arg), (TFL_RoundOp $arg)>; +def LegalizeRsqrt : Pat<(TF_RsqrtOp $arg), (TFL_RsqrtOp $arg)>; +def LegalizeSqrt : Pat<(TF_SqrtOp $arg), (TFL_SqrtOp $arg)>; +def LegalizeSquare : Pat<(TF_SquareOp $arg), (TFL_SquareOp $arg)>; +def LegalizeSegmentSum : Pat<(TF_SegmentSumOp $data, I32Tensor:$segment_ids), + (TFL_SegmentSumOp $data, $segment_ids)>; +def LegalizeSelect : Pat<(TF_SelectOp $cond, $x, $y), + (TFL_SelectOp $cond, $x, $y)>; +def LegalizeSelectV2SameStaticShape : Pat<(TF_SelectV2Op:$src_op $cond, $x, $y), + (TFL_SelectOp $cond, $x, $y), + 
[(HasSameStaticShapes $src_op)]>; +def LegalizeSelectV2NotSameStaticShape : Pat< + (TF_SelectV2Op:$src_op $cond, $x, $y), + (TFL_SelectV2Op $cond, $x, $y), + [(HasNotSameStaticShapes $src_op)]>; +def LegalizeShape : Pat<(TF_ShapeOp $arg), (TFL_ShapeOp $arg)>; +def LegalizeSigmoid : Pat<(TF_SigmoidOp $arg), (TFL_LogisticOp $arg)>; +def LegalizeSin : Pat<(TF_SinOp F32Tensor:$arg), (TFL_SinOp $arg)>; +def LegalizeSlice : Pat<(TF_SliceOp $input, $begin, $size), + (TFL_SliceOp $input, $begin, $size)>; +def LegalizeSoftmax : Pat<(TF_SoftmaxOp $arg), + (TFL_SoftmaxOp $arg, ConstF32Attr<"1.0">)>; +def LegalizeSoftPlus : Pat<(TF_SoftplusOp F32Tensor:$arg0), + (TFL_LogOp (TFL_AddOp (TFL_ExpOp $arg0), + (ConstantOp ConstantAttr, "1.0f">), + TFL_AF_None))>; +def LegalizeSqueeze : Pat<(TF_SqueezeOp $arg, $squeeze_dims), + (TFL_SqueezeOp $arg, $squeeze_dims)>; +def LegalizeTanh : Pat<(TF_TanhOp $arg), (TFL_TanhOp $arg)>; +def LegalizeTranspose : Pat<(TF_TransposeOp $arg, $perm), + (TFL_TransposeOp $arg, $perm)>; +def LegalizeWhere : Pat<(TF_WhereOp $arg), (TFL_WhereOp $arg)>; +def LegalizeZerosLike : Pat<(TF_ZerosLikeOp $arg), (TFL_ZerosLikeOp $arg)>; //===----------------------------------------------------------------------===// // Binary ops patterns. //===----------------------------------------------------------------------===// -def : Pat<(TF_LessOp $l, $r), (TFL_LessOp $l, $r)>; -def : Pat<(TF_GreaterOp $l, $r), (TFL_GreaterOp $l, $r)>; +def LegalizeLess : Pat<(TF_LessOp $l, $r), (TFL_LessOp $l, $r)>; +def LegalizeGreater : Pat<(TF_GreaterOp $l, $r), (TFL_GreaterOp $l, $r)>; -def : Pat<(TF_LessEqualOp $l, $r), (TFL_LessEqualOp $l, $r)>; -def : Pat<(TF_GreaterEqualOp $l, $r), (TFL_GreaterEqualOp $l, $r)>; +def LegalizeLessEqual : Pat<(TF_LessEqualOp $l, $r), (TFL_LessEqualOp $l, $r)>; +def LegalizeGreaterEqual : Pat<(TF_GreaterEqualOp $l, $r), + (TFL_GreaterEqualOp $l, $r)>; // Gather in TF -> Gather in TFL with axis=0 // The 'validate_indices' attribute is deprecated. 
-def : Pat<(TF_GatherOp $params, $indices, $ignored_validate_indices), - (TFL_GatherOp $params, $indices, ConstantAttr)>; +def LegalizeGather: Pat< + (TF_GatherOp $params, $indices, $ignored_validate_indices), + (TFL_GatherOp $params, $indices, ConstantAttr)>; -def : Pat<(TF_GatherNdOp $params, $indices), - (TFL_GatherNdOp $params, $indices)>; +def LegalizeGatherNd : Pat<(TF_GatherNdOp $params, $indices), + (TFL_GatherNdOp $params, $indices)>; -def : Pat<(TF_GatherV2Op $params, $indices, - (ConstantOp ElementsAttr:$axis), - ConstantAttr:$batch_dims), - (TFL_GatherOp $params, $indices, - ExtractSingleElementAsInt32:$axis)>; +def LegalizeGatherV2 : Pat< + (TF_GatherV2Op $params, $indices, (ConstantOp ElementsAttr:$axis), + ConstantAttr:$batch_dims), + (TFL_GatherOp $params, $indices, ExtractSingleElementAsInt32:$axis)>; -def : Pat<(TF_FloorDivOp $l, $r), (TFL_FloorDivOp $l, $r)>; +def LegalizeFloorDiv : Pat<(TF_FloorDivOp $l, $r), (TFL_FloorDivOp $l, $r)>; -def : Pat<(TF_NotEqualOp $l, $r, /*incompatible_shape_error=*/ConstBoolAttrTrue), - (TFL_NotEqualOp $l, $r)>; +def LegalizeNotEqual : Pat< + (TF_NotEqualOp $l, $r, /*incompatible_shape_error=*/ConstBoolAttrTrue), + (TFL_NotEqualOp $l, $r)>; -def : Pat<(TF_LogicalAndOp $l, $r), (TFL_LogicalAndOp $l, $r)>; +def LegalizeLogicalAnd : Pat<(TF_LogicalAndOp $l, $r), + (TFL_LogicalAndOp $l, $r)>; -def : Pat<(TF_LogicalOrOp $l, $r), (TFL_LogicalOrOp $l, $r)>; +def LegalizeLogicalOr : Pat<(TF_LogicalOrOp $l, $r), (TFL_LogicalOrOp $l, $r)>; + +def LegalizeAdd : Pat<(TF_AddOp $lhs, $rhs), + (TFL_AddOp $lhs, $rhs, TFL_AF_None)>; +def LegalizeAddv2 : Pat<(TF_AddV2Op $lhs, $rhs), + (TFL_AddOp $lhs, $rhs, TFL_AF_None)>; +def LegalizeBiasAdd : Pat< + (TF_BiasAddOp F32Tensor:$l, F32Tensor:$r, IsDataFormatNHWC:$data_format), + (TFL_AddOp $l, $r, TFL_AF_None)>; +def LegalizeSub : Pat<(TF_SubOp $lhs, $rhs), + (TFL_SubOp $lhs, $rhs, TFL_AF_None)>; +def LegalizeMul : Pat<(TF_MulOp $lhs, $rhs), + (TFL_MulOp $lhs, $rhs, TFL_AF_None)>; +def LegalizeRealDiv : Pat<(TF_RealDivOp $lhs, $rhs), + (TFL_DivOp $lhs, $rhs, TFL_AF_None)>; +def LegalizeDiv : Pat<(TF_DivOp $lhs, $rhs), + (TFL_DivOp $lhs, $rhs, TFL_AF_None)>; -def : Pat<(TF_AddOp $lhs, $rhs), (TFL_AddOp $lhs, $rhs, TFL_AF_None)>; -def : Pat<(TF_AddV2Op $lhs, $rhs), (TFL_AddOp $lhs, $rhs, TFL_AF_None)>; // When batch size is known, TF BatchMatMul gets unfolded to TFL FullyConnected // with additional ops. In the case of unknown batch size, the match will // fall through to here and convert to TF Lite BatchMatMul. -def : Pat<(TF_BatchMatMulV2Op $lhs, $rhs, $adj_x, $adj_y), (TFL_BatchMatMulOp $lhs, $rhs, $adj_x, $adj_y)>; -def : Pat<(TF_BatchMatMulOp $lhs, $rhs, $adj_x, $adj_y), (TFL_BatchMatMulOp $lhs, $rhs, $adj_x, $adj_y)>; -def : Pat<(TF_SubOp $lhs, $rhs), (TFL_SubOp $lhs, $rhs, TFL_AF_None)>; -def : Pat<(TF_MulOp $lhs, $rhs), (TFL_MulOp $lhs, $rhs, TFL_AF_None)>; -def : Pat<(TF_RealDivOp $lhs, $rhs), (TFL_DivOp $lhs, $rhs, TFL_AF_None)>; -def : Pat<(TF_DivOp $lhs, $rhs), (TFL_DivOp $lhs, $rhs, TFL_AF_None)>; +def LegalizeBatchMatMulV2UnknownBatch : Pat< + (TF_BatchMatMulV2Op $lhs, $rhs, $adj_x, $adj_y), + (TFL_BatchMatMulOp $lhs, $rhs, $adj_x, $adj_y)>; +def LegalizeBatchMatMulUnknownBatch : Pat< + (TF_BatchMatMulOp $lhs, $rhs, $adj_x, $adj_y), + (TFL_BatchMatMulOp $lhs, $rhs, $adj_x, $adj_y)>; -def : Pat<(TF_BiasAddOp F32Tensor:$l, F32Tensor:$r, - IsDataFormatNHWC:$data_format), - (TFL_AddOp $l, $r, TFL_AF_None)>; -// TODO(jpienaar): These should be handled by the pattern rewriter, find out -// why it isn't. 
-def : Pat<(TF_Relu6Op (TF_BiasAddOp F32Tensor:$l, F32Tensor:$r, - IsDataFormatNHWC:$data_format)), - (TFL_AddOp $l, $r, TFL_AF_Relu6)>; - -def : Pat<(TF_FakeQuantWithMinMaxVarsOp $inputs, - (ConstantOp F32ElementsAttr:$min), - (ConstantOp F32ElementsAttr:$max), - $num_bits, $narrow_range), - (TFL_DequantizeOp - (TFL_QuantizeOp $inputs, - (ConvertToQuantTypeFromAttrs $inputs, $min, $max, - $num_bits, $narrow_range)))>; +def LegalizeFakeQuantWithMinMaxVars: Pat< + (TF_FakeQuantWithMinMaxVarsOp $inputs, (ConstantOp F32ElementsAttr:$min), + (ConstantOp F32ElementsAttr:$max), $num_bits, $narrow_range), + (TFL_DequantizeOp + (TFL_QuantizeOp $inputs, (ConvertToQuantTypeFromAttrs $inputs, $min, $max, + $num_bits, $narrow_range)))>; // TODO(rocky): Not all of the attributes are handled correctly. Make this // more general if there is a need. -def : Pat<(TF_QuantizeAndDequantizeV2Op $inputs, - (ConstantOp F32ElementsAttr:$min), - (ConstantOp F32ElementsAttr:$max), - $signed_input, $num_bits, $range_given, $round_mode, - $narrow_range, $axis), - (TFL_DequantizeOp - (TFL_QuantizeOp $inputs, - (ConvertToQuantTypeFromAttrs $inputs, $min, $max, - $num_bits, $narrow_range)))>; +def LegalizeQuantizeAndDequantizeV2 : Pat< + (TF_QuantizeAndDequantizeV2Op $inputs, (ConstantOp F32ElementsAttr:$min), + (ConstantOp F32ElementsAttr:$max), + $signed_input, $num_bits, $range_given, $round_mode, $narrow_range, $axis), + (TFL_DequantizeOp + (TFL_QuantizeOp $inputs, (ConvertToQuantTypeFromAttrs $inputs, $min, $max, + $num_bits, $narrow_range)))>; -def : Pat<(TF_RankOp $input), (TFL_RankOp $input)>; +def LegalizeRank : Pat<(TF_RankOp $input), (TFL_RankOp $input)>; -def : Pat<(TF_SquaredDifferenceOp $l, $r), (TFL_SquaredDifferenceOp $l, $r)>; +def LegalizeSquaredDifference : Pat<(TF_SquaredDifferenceOp $l, $r), + (TFL_SquaredDifferenceOp $l, $r)>; -// Note(ycling): We can eliminate Relu from Relu(SquaredDifference(x, y)), -// since the result of SquaredDifference is always non-negative. -// TFLite interpreter doesn't support Relu+int32 for now. So the test cases -// are failing without the following pattern to optimize Relu away fixes -// the problem. 
-def : Pat<(TF_ReluOp (TF_SquaredDifferenceOp $l, $r)), - (TFL_SquaredDifferenceOp $l, $r)>; +def LegalizeReverseV2 : Pat<(TF_ReverseV2Op $arg0, $arg1), + (TFL_ReverseV2Op $arg0, $arg1)>; -def : Pat<(TF_ReverseV2Op $arg0, $arg1), (TFL_ReverseV2Op $arg0, $arg1)>; +def LegalizeEqual : Pat<(TF_EqualOp $arg0, $arg1, + /*incompatible_shape_error=*/ConstBoolAttrTrue), + (TFL_EqualOp $arg0, $arg1)>; -def : Pat<(TF_EqualOp $arg0, $arg1, /*incompatible_shape_error=*/ConstBoolAttrTrue), (TFL_EqualOp $arg0, $arg1)>; +def LegalizePad : Pat<(TF_PadOp $arg0, $arg1), (TFL_PadOp $arg0, $arg1)>; -def : Pat<(TF_PadOp $arg0, $arg1), (TFL_PadOp $arg0, $arg1)>; +def LegalizeTile : Pat<(TF_TileOp $arg0, $arg1), (TFL_TileOp $arg0, $arg1)>; -def : Pat<(TF_TileOp $arg0, $arg1), (TFL_TileOp $arg0, $arg1)>; +def LegalizePadV2 : Pat<(TF_PadV2Op $arg0, $arg1, $cst), + (TFL_PadV2Op $arg0, $arg1, $cst)>; -def : Pat<(TF_PadV2Op $arg0, $arg1, $cst), (TFL_PadV2Op $arg0, $arg1, $cst)>; +def LegalizeMean : Pat<(TF_MeanOp $arg0, $arg1, BoolAttr:$arg2), + (TFL_MeanOp $arg0, $arg1, $arg2)>; -def : Pat<(TF_MeanOp $arg0, $arg1, BoolAttr:$arg2), (TFL_MeanOp $arg0, $arg1, $arg2)>; - -def : Pat<(TF_SumOp $arg, $axes, BoolAttr:$arg2), (TFL_SumOp $arg, $axes, $arg2)>; +def LegalizeSum : Pat<(TF_SumOp $arg, $axes, BoolAttr:$arg2), + (TFL_SumOp $arg, $axes, $arg2)>; // TopK in TFL is always sorted so we ignore that attribute here. -def : Pat<(TF_TopKV2Op $input, $k, $ignored_sorted), (TFL_TopKV2Op $input, $k)>; +def LegalizeTopKV2 : Pat<(TF_TopKV2Op $input, $k, $ignored_sorted), + (TFL_TopKV2Op $input, $k)>; -def : Pat<(TF_MinOp $arg0, $arg1, BoolAttr:$arg2), (TFL_ReduceMinOp $arg0, $arg1, $arg2)>; +def LegalizeMin : Pat<(TF_MinOp $arg0, $arg1, BoolAttr:$arg2), + (TFL_ReduceMinOp $arg0, $arg1, $arg2)>; -def : Pat<(TF_MaxOp $arg0, $arg1, BoolAttr:$arg2), (TFL_ReduceMaxOp $arg0, $arg1, $arg2)>; +def LegalizeMax : Pat<(TF_MaxOp $arg0, $arg1, BoolAttr:$arg2), + (TFL_ReduceMaxOp $arg0, $arg1, $arg2)>; -def : Pat<(TF_ProdOp $arg0, $arg1, BoolAttr:$arg2), (TFL_ReduceProdOp $arg0, $arg1, $arg2)>; +def LegalizeProd : Pat<(TF_ProdOp $arg0, $arg1, BoolAttr:$arg2), + (TFL_ReduceProdOp $arg0, $arg1, $arg2)>; -def : Pat<(TF_AnyOp $input, $reduction_indices, $keep_dims), - (TFL_ReduceAnyOp $input, $reduction_indices, $keep_dims)>; +def LegalizeAny : Pat<(TF_AnyOp $input, $reduction_indices, $keep_dims), + (TFL_ReduceAnyOp $input, $reduction_indices, $keep_dims)>; -def : Pat<(TF_CastOp $arg0, BoolAttr:$arg1), (TFL_CastOp $arg0)>; +def LegalizeCast : Pat<(TF_CastOp $arg0, BoolAttr:$arg1), (TFL_CastOp $arg0)>; -def : Pat<(TF_BatchToSpaceNDOp $input, $block_shape, $crops), (TFL_BatchToSpaceNdOp $input, $block_shape, $crops)>; +def LegalizeBatchToSpaceND : Pat< + (TF_BatchToSpaceNDOp $input, $block_shape, $crops), + (TFL_BatchToSpaceNdOp $input, $block_shape, $crops)>; -def : Pat<(TF_SpaceToBatchNDOp $input, $block_shape, $paddings), (TFL_SpaceToBatchNdOp $input, $block_shape, $paddings)>; +def LegalizeSpaceToBatchND : Pat< + (TF_SpaceToBatchNDOp $input, $block_shape, $paddings), + (TFL_SpaceToBatchNdOp $input, $block_shape, $paddings)>; -def : Pat<(TF_SpaceToDepthOp $input, $block_size, IsDataFormatNHWC:$data_format), - (TFL_SpaceToDepthOp $input, (convertIntAttrTo32Bit $block_size))>; +def LegalizeSpaceToDepth : Pat< + (TF_SpaceToDepthOp $input, $block_size, IsDataFormatNHWC:$data_format), + (TFL_SpaceToDepthOp $input, (convertIntAttrTo32Bit $block_size))>; -def : Pat<(TF_DepthToSpaceOp $input, $block_size, IsDataFormatNHWC:$data_format), - 
(TFL_DepthToSpaceOp $input, (convertIntAttrTo32Bit $block_size))>; +def LegalizeDepthToSpace : Pat< + (TF_DepthToSpaceOp $input, $block_size, IsDataFormatNHWC:$data_format), + (TFL_DepthToSpaceOp $input, (convertIntAttrTo32Bit $block_size))>; -def : Pat<(TF_ResizeBilinearOp $images, $size, $align_corners, $half_pixel_centers), (TFL_ResizeBilinearOp $images, $size, $align_corners, $half_pixel_centers)>; -def : Pat<(TF_ResizeNearestNeighborOp $images, $size, $align_corners, $half_pixel_centers), (TFL_ResizeNearestNeighborOp $images, $size, $align_corners, $half_pixel_centers)>; +def LegalizeResizeBilinear : Pat< + (TF_ResizeBilinearOp $images, $size, $align_corners, $half_pixel_centers), + (TFL_ResizeBilinearOp $images, $size, $align_corners, $half_pixel_centers)>; +def LegalizeResizeNearestNeighbor : Pat< + (TF_ResizeNearestNeighborOp $images, $size, $align_corners, + $half_pixel_centers), + (TFL_ResizeNearestNeighborOp $images, $size, $align_corners, + $half_pixel_centers)>; -def : Pat<(TF_MirrorPadOp $arg0, $arg1, $cst), (TFL_MirrorPadOp $arg0, $arg1, $cst)>; +def LegalizeMirrorPad : Pat<(TF_MirrorPadOp $arg0, $arg1, $cst), + (TFL_MirrorPadOp $arg0, $arg1, $cst)>; -def : Pat<(TF_SparseToDenseOp $sparse_indices, $output_shape, $sparse_values, $default_value, $validate_indices), - (TFL_SparseToDenseOp $sparse_indices, $output_shape, $sparse_values, $default_value)>; +def LegalizeSparseToDense : Pat< + (TF_SparseToDenseOp $sparse_indices, $output_shape, $sparse_values, + $default_value, $validate_indices), + (TFL_SparseToDenseOp $sparse_indices, $output_shape, $sparse_values, + $default_value)>; -def : Pat<(TF_UniqueOp $arg0),(TFL_UniqueOp $arg0)>; +def LegalizeUnique : Pat<(TF_UniqueOp $arg0),(TFL_UniqueOp $arg0)>; -def : Pat<(TF_FloorModOp $arg0, $arg1), (TFL_FloorModOp $arg0, $arg1)>; -def : Pat<(TF_ExpOp $arg0), (TFL_ExpOp $arg0)>; +def LegalizeFloorMod : Pat<(TF_FloorModOp $arg0, $arg1), + (TFL_FloorModOp $arg0, $arg1)>; +def LegalizeExp : Pat<(TF_ExpOp $arg0), (TFL_ExpOp $arg0)>; -def : Pat<(TF_LRNOp $arg0, $radius, F32Attr:$bias, F32Attr:$alpha, F32Attr:$beta), (TFL_LocalResponseNormalizationOp $arg0, (convertIntAttrTo32Bit $radius), $bias, $alpha, $beta)>; +def LegalizeLRN : Pat< + (TF_LRNOp $arg0, $radius, F32Attr:$bias, F32Attr:$alpha, F32Attr:$beta), + (TFL_LocalResponseNormalizationOp $arg0, (convertIntAttrTo32Bit $radius), + $bias, $alpha, $beta)>; -def : Pat< - (TF_NonMaxSuppressionV4Op $boxes, $scores, $max_output_size, $iou_threshold, $score_threshold, $pad_to_max_output_size), - (TFL_NonMaxSuppressionV4Op $boxes, $scores, $max_output_size, $iou_threshold, $score_threshold)>; +def LegalizeNonMaxSuppressionV4 : Pat< + (TF_NonMaxSuppressionV4Op $boxes, $scores, $max_output_size, $iou_threshold, + $score_threshold, $pad_to_max_output_size), + (TFL_NonMaxSuppressionV4Op $boxes, $scores, $max_output_size, $iou_threshold, + $score_threshold)>; -def : Pat< - (TF_NonMaxSuppressionV5Op $boxes, $scores, $max_output_size, $iou_threshold, $score_threshold, $soft_nms_sigma, $pad_to_max_output_size), - (TFL_NonMaxSuppressionV5Op $boxes, $scores, $max_output_size, $iou_threshold, $score_threshold, $soft_nms_sigma)>; +def LegalizeNonMaxSuppressionV5 : Pat< + (TF_NonMaxSuppressionV5Op $boxes, $scores, $max_output_size, $iou_threshold, + $score_threshold, $soft_nms_sigma, $pad_to_max_output_size), + (TFL_NonMaxSuppressionV5Op $boxes, $scores, $max_output_size, $iou_threshold, + $score_threshold, $soft_nms_sigma)>; -def : Pat<(TF_MatrixDiagOp $diagonal), (TFL_MatrixDiagOp $diagonal)>; +def 
LegalizeMatrixDiag : Pat<(TF_MatrixDiagOp $diagonal), + (TFL_MatrixDiagOp $diagonal)>; class I32VectorElementsAttr : ElementsAttrBase< CPred<"$_self.isa() &&" @@ -356,7 +406,7 @@ class I32VectorElementsAttr : ElementsAttrBase< "RankedTensorType::get({" # len # "}, $_builder.getIntegerType(32)), $0)"; } -def : Pat< +def LegalizeConv2DBackpropInput : Pat< (TF_Conv2DBackpropInputOp $input_sizes, $filter, $out_backprop, IsIntList1XY1:$strides, BoolAttr:$use_cudnn_on_gpu, @@ -373,9 +423,10 @@ def : Pat< /*stride_h=*/ ExtractI32At<1>:$strides, /*stride_w=*/ ExtractI32At<2>:$strides)>; -def : Pat< +def LegalizeMatrixSetDiag : Pat< (TF_MatrixSetDiagOp $input, $diagonal), (TFL_MatrixSetDiagOp $input, $diagonal)>; -def : Pat<(TF_ScatterNdOp I32Tensor:$indices, $updates, $shape), - (TFL_ScatterNdOp I32Tensor:$indices, $updates, $shape)>; +def LegalizeScatterNd : Pat< + (TF_ScatterNdOp I32Tensor:$indices, $updates, $shape), + (TFL_ScatterNdOp I32Tensor:$indices, $updates, $shape)>; diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc index 1328a2baf5d..7a16e475ce3 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc @@ -158,7 +158,7 @@ LogicalResult ConvertTFRandomUniformOp::matchAndRewrite( random_uniform_op.seed().getSExtValue(), random_uniform_op.seed2().getSExtValue()); Distribution dist; - int num_elements = 0; + size_t num_elements = 0; if (auto output_type = random_uniform_op.output().getType().dyn_cast_or_null()) { if (auto ranked_output = output_type.dyn_cast_or_null()) { diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_tf_while.cc b/tensorflow/compiler/mlir/lite/transforms/legalize_tf_while.cc index 31e3f6dd005..6202507ae91 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_tf_while.cc +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_tf_while.cc @@ -49,23 +49,19 @@ void RunOnWhile(TF::WhileOp while_op) { op->getLoc(), op->getResultTypes(), op->getOperands(), while_op.is_stateless()); // Insert call to the given function into the 'region'. - auto create_region_with_call = [&while_op](FlatSymbolRefAttr symbol, - Region& region) { + auto create_region_with_call = [&while_op](FuncOp func, Region& region) { OpBuilder builder(region); auto block = builder.createBlock(®ion); SmallVector new_operands; - auto func = while_op.getParentOfType().lookupSymbol( - symbol.getValue()); for (Type t : func.getType().getInputs()) new_operands.push_back(block->addArgument(t)); - auto call = builder.create( - while_op.getLoc(), symbol, func.getType().getResults(), new_operands); + auto call = builder.create(while_op.getLoc(), func, new_operands); builder.create(while_op.getLoc(), call.getResults()); // Mark old function as private so that it can be DCE'd if not called. 
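// The switch below from condAttr()/bodyAttr() plus a manual lookupSymbol to
// the cond_func()/body_func() accessors relies on those accessors resolving
// the referenced functions through the enclosing module's symbol table, so
// the helper can take a FuncOp directly and build the call from it.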
func.setVisibility(SymbolTable::Visibility::Private); }; - create_region_with_call(while_op.condAttr(), new_op.cond()); - create_region_with_call(while_op.bodyAttr(), new_op.body()); + create_region_with_call(while_op.cond_func(), new_op.cond()); + create_region_with_call(while_op.body_func(), new_op.body()); op->replaceAllUsesWith(new_op.getResults()); op->erase(); diff --git a/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc b/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc index 2498a732a86..edddc7751ab 100644 --- a/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc +++ b/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc @@ -17,7 +17,7 @@ limitations under the License. // converting Tensorlist operations in TensorFlow dialect into operations that // can be legalized to TensorFlow Lite dialect with simple replacements. The // newly created operations are in the TensorFlow dialect if the operation can -// be represented using a TensorFlow op. Otherwise, TensorFlow Lite dialect op +// be represented using a TensorFlow op. Otherwise, TensorFlow Lite dialect op // is used. #include @@ -332,9 +332,8 @@ struct ConvertTensorListInitOp : public OpConversionPattern { ConversionPatternRewriter &rewriter) const override { Type dtype = op.element_dtype(); if (!(dtype.isF16() || dtype.isF32() || dtype.isF64() || - dtype.isInteger(1) || dtype.isSignlessInteger(8) || - dtype.isSignlessInteger(16) || dtype.isSignlessInteger(32) || - dtype.isSignlessInteger(64))) { + dtype.isInteger(1) || dtype.isInteger(8) || dtype.isInteger(16) || + dtype.isInteger(32) || dtype.isInteger(64))) { op.emitError( "requires element_dtype to be 1-bit/8-bit/16-bit/32-bit/64-bit " "integer or 16-bit/32-bit/64-bit float type during TF Lite " @@ -739,14 +738,18 @@ struct ConvertIdentity : public OpConversionPattern { } }; +// Returns an unranked tensor type with an element of the same type as `value` +// if `type` is a tensor of variant. Otherwise, returns `type` unmodified. +Type VariantToUnrankedTensorType(Type type, Value value) { + if (getElementTypeOrSelf(type).isa()) + return UnrankedTensorType::get(getElementTypeOrSelf(value.getType())); + return type; +} + // Changes the function type of `cond_func` and `body_func` for the given While // op. -static LogicalResult UpdateFunctionTypes(TF::WhileOp op) { - auto module = op.getParentOfType(); - auto *context = module.getContext(); - - for (StringRef func_name : {op.cond(), op.body()}) { - FuncOp func = module.lookupSymbol(func_name); +LogicalResult UpdateFunctionTypes(TF::WhileOp op) { + for (FuncOp func : {op.cond_func(), op.body_func()}) { if (!func) continue; FunctionType func_type = func.getType(); @@ -757,42 +760,29 @@ static LogicalResult UpdateFunctionTypes(TF::WhileOp op) { // tensor type if it's a variant type. SmallVector updated_argument_types; updated_argument_types.reserve(num_inputs); - for (int i = 0; i < num_inputs; ++i) { - Type arg_type = func_type.getInput(i); - if (getElementTypeOrSelf(arg_type).isa()) { - arg_type = UnrankedTensorType::get( - getElementTypeOrSelf(op.getOperand(i).getType())); - } - updated_argument_types.push_back(arg_type); - } + for (auto it : llvm::zip(func_type.getInputs(), op.getOperands())) + updated_argument_types.push_back( + VariantToUnrankedTensorType(std::get<0>(it), std::get<1>(it))); - // For each result type in function's results, change it to unranked tensor - // type if it's a variant type. 
+ // Change all DT_VARIANT result types in function results to unranked tensor + // type with element type derived from the corresponding input operand. This + // is correct because while body's inputs and results have the same type. SmallVector updated_result_types; updated_result_types.reserve(num_results); - for (int i = 0; i < num_results; ++i) { - Type result_type = func_type.getResult(i); - if (getElementTypeOrSelf(result_type).isa()) { - // Here update the variant type with the unranked tensor type derived - // from the corresponding input operand. This is correct because while - // body's inputs and results have the same type. - result_type = UnrankedTensorType::get( - getElementTypeOrSelf(op.getOperand(i).getType())); - } - updated_result_types.push_back(result_type); - } + for (auto it : llvm::zip(func_type.getResults(), op.getOperands())) + updated_result_types.push_back( + VariantToUnrankedTensorType(std::get<0>(it), std::get<1>(it))); // Change `func`'s argument type to `unranked_argument_types`. If it // return types contain a `DT_VARIANT`, change it to the unranked type // derived from the corresponding argument. func.setType(FunctionType::get(updated_argument_types, updated_result_types, - context)); + op.getContext())); // Change the argument type for the first block. - Block &body_first_bb = func.front(); - for (int i = 0; i < body_first_bb.getNumArguments(); ++i) { - body_first_bb.getArgument(i).setType(updated_argument_types[i]); - } + llvm::for_each(func.getArguments(), [&](BlockArgument &arg) { + arg.setType(updated_argument_types[arg.getArgNumber()]); + }); } return success(); } @@ -805,25 +795,60 @@ struct ConvertWhile : public OpConversionPattern { ConversionPatternRewriter &rewriter) const override { llvm::SmallVector result_types; result_types.reserve(op.getNumOperands()); - for (int i = 0, e = operands.size(); i != e; ++i) { - Type result_ty = op.getResult(i).getType(); + // Change all DT_VARIANT result types to unranked tensor type. + for (auto it : llvm::zip(op.getResultTypes(), operands)) + result_types.push_back( + VariantToUnrankedTensorType(std::get<0>(it), std::get<1>(it))); - // If we notice the result type is a DT_VARIANT, we change the - // corresponding result type to unranked tensor type. - if (getElementTypeOrSelf(result_ty).isa()) { - Type element_ty = getElementTypeOrSelf(operands[i].getType()); - result_ty = UnrankedTensorType::get(element_ty); + // Create a new while op with new operands and updated result types. + auto converted = rewriter.create(op.getLoc(), result_types, + operands, op.getAttrs()); + converted.removeAttr("T"); + UpdateFunctionTypes(converted); + + rewriter.replaceOp(op, converted.getResults()); + return success(); + } +}; + +struct ConvertWhileRegion : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite( + TF::WhileRegionOp op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + llvm::SmallVector result_types; + result_types.reserve(op.getNumOperands()); + // Change all DT_VARIANT result types to unranked tensor type. + for (auto it : llvm::zip(op.getResultTypes(), operands)) + result_types.push_back( + VariantToUnrankedTensorType(std::get<0>(it), std::get<1>(it))); + + // Create a new while op with new operands and updated result types. 
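+    // Zipping result types with operands above is safe: the i-th result of a
+    // while op carries the same loop-carried value as its i-th operand, so a
+    // variant result's element type can be taken from the matching operand.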
+ auto converted = rewriter.create<TF::WhileRegionOp>( + op.getLoc(), result_types, operands, op.getAttrs()); + + // Inline the regions from the old while into the new one, and apply + // signature conversion to the inlined regions. + for (auto it : llvm::zip(op.getRegions(), converted.getRegions())) { + Region &old_region = *std::get<0>(it); + Region &new_region = *std::get<1>(it); + + Block &entry = old_region.front(); + // Build signature conversion for the region. + TypeConverter::SignatureConversion signature_conversion(operands.size()); + for (auto it : llvm::zip(entry.getArguments(), operands)) { + BlockArgument arg = std::get<0>(it); + signature_conversion.addInputs( + arg.getArgNumber(), + VariantToUnrankedTensorType(arg.getType(), std::get<1>(it))); } - result_types.push_back(result_ty); + + rewriter.inlineRegionBefore(old_region, new_region, new_region.end()); + rewriter.applySignatureConversion(&new_region, signature_conversion); } - // Clone original while op with new operands and updated result types. - auto cloned = rewriter.create<TF::WhileOp>(op.getLoc(), result_types, - operands, op.getAttrs()); - cloned.removeAttr("T"); - UpdateFunctionTypes(cloned); - - rewriter.replaceOp(op, cloned.getResults()); + rewriter.replaceOp(op, converted.getResults()); return success(); } }; @@ -872,7 +897,8 @@ LogicalResult LowerStaticTensorListPass::RewriteFunction( ConvertTensorListGetItem, ConvertTensorListLength, ConvertTensorListPushBack, ConvertTensorListReserve, ConvertTensorListSetItem, ConvertTensorListStack, - ConvertTensorListResize, ConvertWhile>(context); + ConvertTensorListResize, ConvertWhile, ConvertWhileRegion>( + context); return applyPartialConversion(func, target, patterns); } diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize.cc b/tensorflow/compiler/mlir/lite/transforms/optimize.cc index d26a4906420..eeecfac67cf 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/optimize.cc @@ -160,6 +160,31 @@ bool CanFuseConvOrDepthwiseConv(Attribute filter, Attribute val, return false; } +// Returns true if we can eliminate the GatherNdOp or ScatterNdOp. When the values +// of `indices` are from 0 to n-1, the output tensor is identical to +// `params`. +bool CanOptimizeIdentityGatherNdOrScatterNdOp(Value params, + DenseIntElementsAttr indices) { + auto params_type = params.getType().dyn_cast<RankedTensorType>(); + auto indices_type = indices.getType().dyn_cast<RankedTensorType>(); + // Checks that the shape of `params` is [n, ...] and the shape of `indices` is + // [n, 1], i.e. each 2-D index selects one row of `params` along its first + // dimension. As long as `indices` walks that dimension in order, the output + // is identical to the input. + if (!params_type || !indices_type || indices_type.getRank() != 2 || + indices_type.getDimSize(0) != params_type.getDimSize(0) || + indices_type.getDimSize(1) != 1) + return false; + + // Checks that the values in `indices` run from 0 to n-1. + int cur_value = 0; + for (const auto &v : indices.getValues<APInt>()) { + if (v.getSExtValue() != cur_value) return false; + ++cur_value; + } + + return true; +} + // Expand Attribute 'a' to 4D with all 1s except 1 dimension. // Which dimension depends on whether 'is_depthwise' is true or false.
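// For CanOptimizeIdentityGatherNdOrScatterNdOp above, the value check boils
// down to verifying that `indices` is the column vector [0, 1, ..., n-1]. A
// self-contained sketch of the same test, assuming the indices have already
// been unpacked into a plain vector:
//
//   bool IsIotaIndices(const std::vector<int64_t>& indices) {
//     for (size_t i = 0; i < indices.size(); ++i) {
//       if (indices[i] != static_cast<int64_t>(i)) return false;
//     }
//     return true;
//   }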
ElementsAttr ExpandTo4DForConvImpl(Attribute a, bool is_depthwise) { @@ -197,9 +222,10 @@ TypeAttr RescaleQtype(Type input, Attribute factor) { DenseElementsAttr GetShape(Value output_val) { auto output_type = output_val.getType().cast(); auto shape_vector = output_type.getShape(); - std::vector shape(shape_vector.size()); - for (int i = 0; i < shape_vector.size(); ++i) { - shape[i] = shape_vector[i]; + std::vector shape; + shape.reserve(shape_vector.size()); + for (auto shape_object : shape_vector) { + shape.push_back(shape_object); } return mlir::DenseElementsAttr::get( RankedTensorType::get( @@ -684,7 +710,7 @@ struct ConvertTrivialTransposeOpToReshapeOp SmallVector old_major_index_ordering; SmallVector new_major_index_ordering; - for (int i = 0; i < input_shape.size(); i++) { + for (int i = 0, end = input_shape.size(); i < end; i++) { if (input_shape[i] != 1) { old_major_index_ordering.push_back(i); } diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_functional_ops.cc b/tensorflow/compiler/mlir/lite/transforms/optimize_functional_ops.cc index 18c1912d4c7..2311ae0668c 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_functional_ops.cc +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_functional_ops.cc @@ -83,16 +83,15 @@ class FoldIfOp : public OpRewritePattern { if (!llvm::hasSingleElement(parent_op)) return failure(); // Find the then and else branch functions. - SymbolTable table(op.getParentOfType()); - FuncOp then_branch = table.lookup(op.then_branch()); - FuncOp else_branch = table.lookup(op.else_branch()); + FuncOp then_func = op.then_func(); + FuncOp else_func = op.else_func(); // If the If has no uses and its functions are side-effect free, then // remove. // TODO(jpienaar): Remove once recusive side-effects are supported. if (op.use_empty() && (op.is_stateless() || - (IsSideEffectFree(then_branch) && IsSideEffectFree(else_branch)))) { + (IsSideEffectFree(then_func) && IsSideEffectFree(else_func)))) { rewriter.eraseOp(op.getOperation()); return success(); } @@ -109,7 +108,7 @@ class FoldIfOp : public OpRewritePattern { // Identify the branch to inline. bool cond_value = (*cond.int_value_begin()).getSExtValue(); - FuncOp func = cond_value ? then_branch : else_branch; + FuncOp func = cond_value ? then_func : else_func; // Make sure that the function has exactly one block to simplify inlining. // TFLite doesn't use control flow with blocks so functions with more than diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td index 1fae567c835..3c5fc7a0c5e 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td @@ -37,22 +37,19 @@ class HasRankAtMost : Constraint< // Multi-pattern consisting of matching stand-alone convolution op followed by // activation op. 
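// As a rough example, the rewrite below turns
//   %conv = "tfl.conv_2d"(%input, %filter, %bias)
//             {fused_activation_function = "NONE", ...}
//   %out  = "tfl.relu"(%conv)
// into a single
//   %out = "tfl.conv_2d"(%input, %filter, %bias)
//            {fused_activation_function = "RELU", ...}
// and only fires when %conv has no other users (the HasOneUse constraint).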
multiclass FuseActFnIntoConvOpPat { - def : Pat<(ActFnOp (TFL_Conv2DOp:$conv_out $input, $filter, $bias, - $h_factor, $w_factor, TFL_AF_None, - $padding, $stride_h, $stride_w)), - (TFL_Conv2DOp $input, $filter, $bias, - $h_factor, $w_factor, ActFnAttr, - $padding, $stride_h, $stride_w), - [(HasOneUse $conv_out)]>; - def : Pat<(ActFnOp (TFL_DepthwiseConv2DOp:$conv_out $input, $filter, $bias, - $h_factor, $w_factor, TFL_AF_None, - $padding, $stride_h, $stride_w, - $multiplier)), - (TFL_DepthwiseConv2DOp $input, $filter, $bias, - $h_factor, $w_factor, ActFnAttr, - $padding, $stride_h, $stride_w, - $multiplier), - [(HasOneUse $conv_out)]>; + def FuseActivationFuncWithConv#ActFnOp#ActFnAttr : Pat< + (ActFnOp (TFL_Conv2DOp:$conv_out $input, $filter, $bias, $h_factor, + $w_factor, TFL_AF_None, $padding, $stride_h, $stride_w)), + (TFL_Conv2DOp $input, $filter, $bias, $h_factor, $w_factor, ActFnAttr, + $padding, $stride_h, $stride_w), + [(HasOneUse $conv_out)]>; + def FuseActivationFuncWithDepthwiseConv#ActFnOp#ActFnAttr : Pat< + (ActFnOp (TFL_DepthwiseConv2DOp:$conv_out $input, $filter, $bias, $h_factor, + $w_factor, TFL_AF_None, $padding, $stride_h, $stride_w, + $multiplier)), + (TFL_DepthwiseConv2DOp $input, $filter, $bias, $h_factor, $w_factor, + ActFnAttr, $padding, $stride_h, $stride_w, $multiplier), + [(HasOneUse $conv_out)]>; } // TODO(hinsu): Also fuse ops corresponding to SIGN_BIT fused @@ -73,33 +70,29 @@ class CanFuseConvOrDepthwiseConv : Constraint< // constant folding the bias and the binary op's constant operand. The following // pattern restricts to float constant values for now. multiclass FuseBinaryOpToPrecedingAffine { - def : Pat<(binaryOp (TFL_Conv2DOp:$output $input, $filter, - (ConstantOp F32ElementsAttr:$bias), - $h_factor, $w_factor, TFL_AF_None, - $padding, $stride_h, $stride_w), - (ConstantOp F32ElementsAttr:$value), $act_fn), - (TFL_Conv2DOp $input, $filter, - (binaryOp (ConstantOp $bias), - (ConstantOp $value), TFL_AF_None), - $h_factor, $w_factor, $act_fn, - $padding, $stride_h, $stride_w), - [(CanFuseConvOrDepthwiseConv<"false"> $filter, $value), - (HasOneUse $output)]>; - def : Pat<(binaryOp (TFL_DepthwiseConv2DOp:$output $input, $filter, - (ConstantOp F32ElementsAttr:$bias), - $h_factor, $w_factor, TFL_AF_None, - $padding, $stride_h, $stride_w, - $multiplier), - (ConstantOp F32ElementsAttr:$value), $act_fn), - (TFL_DepthwiseConv2DOp $input, $filter, - (binaryOp (ConstantOp $bias), - (ConstantOp $value), - TFL_AF_None), - $h_factor, $w_factor, $act_fn, - $padding, $stride_h, $stride_w, - $multiplier), - [(CanFuseConvOrDepthwiseConv<"true"> $filter, $value), - (HasOneUse $output)]>; + def FuseBinaryOpWithConv#binaryOp : Pat< + (binaryOp (TFL_Conv2DOp:$output $input, $filter, + (ConstantOp F32ElementsAttr:$bias), $h_factor, $w_factor, + TFL_AF_None, $padding, $stride_h, $stride_w), + (ConstantOp F32ElementsAttr:$value), $act_fn), + (TFL_Conv2DOp $input, $filter, + (binaryOp (ConstantOp $bias), + (ConstantOp $value), TFL_AF_None), + $h_factor, $w_factor, $act_fn, $padding, $stride_h, $stride_w), + [(CanFuseConvOrDepthwiseConv<"false"> $filter, $value), + (HasOneUse $output)]>; + def FuseBinaryOpWithDepthwiseConv#binaryOp : Pat< + (binaryOp (TFL_DepthwiseConv2DOp:$output $input, $filter, + (ConstantOp F32ElementsAttr:$bias), + $h_factor, $w_factor, TFL_AF_None, $padding, $stride_h, + $stride_w, $multiplier), + (ConstantOp F32ElementsAttr:$value), $act_fn), + (TFL_DepthwiseConv2DOp $input, $filter, + (binaryOp (ConstantOp $bias), (ConstantOp $value), TFL_AF_None), + 
$h_factor, $w_factor, $act_fn, $padding, $stride_h, $stride_w, + $multiplier), + [(CanFuseConvOrDepthwiseConv<"true"> $filter, $value), + (HasOneUse $output)]>; } foreach binaryOp = [TFL_AddOp, TFL_SubOp] in defm : FuseBinaryOpToPrecedingAffine; @@ -116,43 +109,43 @@ def ExpandTo4DForDepthwiseConv: NativeCodeCall< // The following pattern restricts to float constant values for now. multiclass FuseMulOrDivWithConv2dOrDepthwiseConv2d { - def : Pat<(BinaryOp (TFL_DepthwiseConv2DOp:$output $input, - (ConstantOp F32ElementsAttr:$filter), - (ConstantOp F32ElementsAttr:$bias), - $h_factor, $w_factor, TFL_AF_None, - $padding, $stride_h, $stride_w, - $multiplier), - (ConstantOp F32ElementsAttr:$value), $act_fn), - (TFL_DepthwiseConv2DOp $input, - (BinaryOp (ConstantOp $filter), - (ConstantOp - (ExpandTo4DForDepthwiseConv $value)), - TFL_AF_None), - (BinaryOp (ConstantOp $bias), - (ConstantOp $value), - TFL_AF_None), - $h_factor, $w_factor, $act_fn, - $padding, $stride_h, $stride_w, - $multiplier), - [(CanFuseConvOrDepthwiseConv<"true"> $filter, $value), - (HasOneUse $output)]>; - def : Pat<(BinaryOp (TFL_Conv2DOp:$conv_output $input, - (ConstantOp F32ElementsAttr:$filter), - (ConstantOp F32ElementsAttr:$bias), - $h_factor, $w_factor, TFL_AF_None, - $padding, $stride_h, $stride_w), - (ConstantOp F32ElementsAttr:$value), $act_fn), - (TFL_Conv2DOp $input, - (BinaryOp (ConstantOp $filter), - (ConstantOp (ExpandTo4DForConv $value)), - TFL_AF_None), - (BinaryOp (ConstantOp $bias), - (ConstantOp $value), - TFL_AF_None), - $h_factor, $w_factor, $act_fn, - $padding, $stride_h, $stride_w), - [(CanFuseConvOrDepthwiseConv<"false"> $filter, $value), - (HasOneUse $conv_output)]>; + def FuseMulOrDivWithDepthwiseConv#BinaryOp : Pat< + (BinaryOp (TFL_DepthwiseConv2DOp:$output $input, + (ConstantOp F32ElementsAttr:$filter), + (ConstantOp F32ElementsAttr:$bias), + $h_factor, $w_factor, TFL_AF_None, $padding, $stride_h, + $stride_w, $multiplier), + (ConstantOp F32ElementsAttr:$value), $act_fn), + (TFL_DepthwiseConv2DOp $input, + (BinaryOp + (ConstantOp $filter), + (ConstantOp (ExpandTo4DForDepthwiseConv $value)), + TFL_AF_None), + (BinaryOp + (ConstantOp $bias), + (ConstantOp $value), + TFL_AF_None), + $h_factor, $w_factor, $act_fn, $padding, $stride_h, + $stride_w, $multiplier), + [(CanFuseConvOrDepthwiseConv<"true"> $filter, $value), + (HasOneUse $output)]>; + def FuseMulOrDivWithConv#BinaryOp : Pat< + (BinaryOp (TFL_Conv2DOp:$conv_output $input, + (ConstantOp F32ElementsAttr:$filter), + (ConstantOp F32ElementsAttr:$bias), + $h_factor, $w_factor, TFL_AF_None, + $padding, $stride_h, $stride_w), + (ConstantOp F32ElementsAttr:$value), $act_fn), + (TFL_Conv2DOp $input, + (BinaryOp (ConstantOp $filter), + (ConstantOp (ExpandTo4DForConv $value)), + TFL_AF_None), + (BinaryOp (ConstantOp $bias), + (ConstantOp $value), + TFL_AF_None), + $h_factor, $w_factor, $act_fn, $padding, $stride_h, $stride_w), + [(CanFuseConvOrDepthwiseConv<"false"> $filter, $value), + (HasOneUse $conv_output)]>; } foreach BinaryOp = [TFL_DivOp, TFL_MulOp] in @@ -177,7 +170,7 @@ class OperandHasRank : Constraint< CPred<"$0.getType().cast().getRank() == " # n>>; // Matching HardSwish -def : Pat< +def MatchHardSwishPattern1 : Pat< (TFL_MulOp (TFL_MulOp $x, (TFL_AddOp @@ -190,7 +183,7 @@ def : Pat< (TFL_HardSwishOp $x), [(EqualOperands $x, $y)]>; -def : Pat< +def MatchHardSwishPattern2 : Pat< (TFL_MulOp $x, (TFL_MulOp @@ -207,7 +200,7 @@ def : Pat< // Matching HardSwish with extra FakeQuant. 
These FakeQuant ops were due to // incorrect placement in the quantization aware training. // TODO(b/149735743): We should make the placement automatically. -def : Pat< +def MatchHardSwishQuantized : Pat< (TFL_MulOp (TFL_DequantizeOp (TFL_QuantizeOp (TFL_MulOp $x, (TFL_DequantizeOp (TFL_QuantizeOp (TFL_AddOp @@ -238,7 +231,8 @@ multiclass L2NormalizePatterns { // This pattern constructs L2NormalizationOp from // Mul->Rsqrt->Sum->Square Or // Div->sqrt->Sum->Square - def : Pat<(FirstOp $operand1, + def L2NormalizePattern1#FirstOp#SecondOp : Pat< + (FirstOp $operand1, (SecondOp (TFL_SumOp (TFL_SquareOp:$sq_op $square_operand), @@ -251,7 +245,8 @@ multiclass L2NormalizePatterns { // Below patterns for L2Normalize when there is an Add or Maximum // adding or clamping to a small constant scalar. - def : Pat<(FirstOp $operand1, + def L2NormalizePattern2#FirstOp#SecondOp : Pat< + (FirstOp $operand1, (SecondOp (TFL_AddOp (TFL_SumOp @@ -265,7 +260,8 @@ multiclass L2NormalizePatterns { (L2NormValidReduceIndex $sq_op, $axis), (ConstDoubleValueLessThan<"1e-3"> $epsilon)]>; - def : Pat<(FirstOp $operand1, + def L2NormalizePattern3#FirstOp#SecondOp : Pat< + (FirstOp $operand1, (SecondOp (TFL_MaximumOp (TFL_SumOp @@ -302,14 +298,16 @@ def HaveSameType : Constraint>; // Pattern for skipping Tile if it is mainly for broadcasting and the // Op is already supporting broadcasting. multiclass FuseTileBroadcastIntoFollowingBinary { - def : Pat<(BinaryOp:$result (TFL_TileOp $input, (ConstantOp $tile)), - $operand, $act_func), - (BinaryOp $input, $operand, $act_func), + def FuseTileBroadcastToBinaryOp1#BinaryOp : Pat< + (BinaryOp:$result (TFL_TileOp $input, (ConstantOp $tile)), + $operand, $act_func), + (BinaryOp $input, $operand, $act_func), [(OperandsBroadcastToOutputType $input, $operand, $result)]>; - def : Pat<(BinaryOp:$result $operand, - (TFL_TileOp $input, (ConstantOp $tile)), $act_func), - (BinaryOp $operand, $input, $act_func), + def FuseTileBroadcastToBinaryOp2#BinaryOp : Pat< + (BinaryOp:$result $operand, + (TFL_TileOp $input, (ConstantOp $tile)), $act_func), + (BinaryOp $operand, $input, $act_func), [(OperandsBroadcastToOutputType $operand, $input, $result)]>; } @@ -318,9 +316,10 @@ multiclass FusedBinaryActivationFuncOpPat { foreach actFnPair = [[TFL_ReluOp, TFL_AF_Relu], [TFL_Relu6Op, TFL_AF_Relu6], [TFL_Relu1Op, TFL_AF_Relu1]] in { - def : Pat<(actFnPair[0] (BinaryOp:$binary_out $lhs, $rhs, TFL_AF_None)), - (BinaryOp $lhs, $rhs, actFnPair[1]), - [(HasOneUse $binary_out)]>; + def FuseBinaryWithActivation#BinaryOp#actFnPair[0] : Pat< + (actFnPair[0] (BinaryOp:$binary_out $lhs, $rhs, TFL_AF_None)), + (BinaryOp $lhs, $rhs, actFnPair[1]), + [(HasOneUse $binary_out)]>; } } @@ -340,21 +339,22 @@ foreach BinaryOp = [TFL_AddOp, TFL_SubOp, TFL_DivOp, TFL_MulOp] in { // transformation, the shape of the binary op result is [40x1600], which // couldn't be reshaped to [1,40,40]. `IsTailOfShape` constraint is added to // make sure $rhs is the tail shape of $lhs. - def : Pat<(BinaryOp (TFL_ReshapeOp:$lhs $input, (ConstantOp:$shape $s)), - (ConstantOp:$rhs $a), TFL_AF_None), - (TFL_ReshapeOp (BinaryOp $input, $rhs, TFL_AF_None), $shape), - // The broadcasting of "BinaryOp" only happens in the lower - // dimensions, and the higher dimensions are same, so we know the - // result and input of the "BinaryOp" in the source pattern have - // the same shape, which is defined by `shape`. - [(IsTailOfShape $rhs, $lhs), - (HasOneUse $lhs), - // The result of the new "BinaryOp" will have the same shape as - // `input`. 
In other words, the shape of the `Reshape` op are not - // changed after the transformation. - (IsTailOfShape $rhs, $input), - (HasRankAtMost<5> $input), - (HasRankAtMost<5> $rhs)]>; + def MoveBinaryOpBeforeReshape#BinaryOp : Pat< + (BinaryOp (TFL_ReshapeOp:$lhs $input, (ConstantOp:$shape $s)), + (ConstantOp:$rhs $a), $act_fn), + (TFL_ReshapeOp (BinaryOp $input, $rhs, $act_fn), $shape), + // The broadcasting of "BinaryOp" only happens in the lower + // dimensions, and the higher dimensions are same, so we know the + // result and input of the "BinaryOp" in the source pattern have + // the same shape, which is defined by `shape`. + [(IsTailOfShape $rhs, $lhs), + (HasOneUse $lhs), + // The result of the new "BinaryOp" will have the same shape as + // `input`. In other words, the shape of the `Reshape` op are not + // changed after the transformation. + (IsTailOfShape $rhs, $input), + (HasRankAtMost<5> $input), + (HasRankAtMost<5> $rhs)]>; } foreach BinaryOp = [TFL_FloorDivOp, TFL_FloorModOp, TFL_MinimumOp, @@ -370,19 +370,20 @@ foreach BinaryOp = [TFL_FloorDivOp, TFL_FloorModOp, TFL_MinimumOp, // transformation, the shape of the binary op result is [40x1600], which // couldn't be reshaped to [1,40,40]. `IsTailOfShape` constraint is added to // make sure $rhs is the tail shape of $lhs. - def : Pat<(BinaryOp (TFL_ReshapeOp:$lhs $input, (ConstantOp:$shape $s)), - (ConstantOp:$rhs $a)), - (TFL_ReshapeOp (BinaryOp $input, $rhs), $shape), - // The broadcasting of "BinaryOp" only happens in the lower - // dimensions, and the higher dimensions are same, so we know the - // result and input of the "BinaryOp" in the source pattern have - // the same shape, which is defined by `shape`. - [(IsTailOfShape $rhs, $lhs), - (HasOneUse $lhs), - // The result of the new "BinaryOp" will have the same shape as - // `input`. In other words, the shape of the `Reshape` op are not - // changed after the transformation. - (IsTailOfShape $rhs, $input)]>; + def MoveBinaryOpBeforeReshape#BinaryOp : Pat< + (BinaryOp (TFL_ReshapeOp:$lhs $input, (ConstantOp:$shape $s)), + (ConstantOp:$rhs $a)), + (TFL_ReshapeOp (BinaryOp $input, $rhs), $shape), + // The broadcasting of "BinaryOp" only happens in the lower + // dimensions, and the higher dimensions are same, so we know the + // result and input of the "BinaryOp" in the source pattern have + // the same shape, which is defined by `shape`. + [(IsTailOfShape $rhs, $lhs), + (HasOneUse $lhs), + // The result of the new "BinaryOp" will have the same shape as + // `input`. In other words, the shape of the `Reshape` op are not + // changed after the transformation. + (IsTailOfShape $rhs, $input)]>; } // Reorder the element-wise value operations and the element move operations, @@ -392,9 +393,10 @@ foreach ValueOp = [TFL_CeilOp, TFL_ExpOp, TFL_FloorOp, TFL_NegOp, TFL_TanhOp, TFL_SqrtOp, TFL_SquareOp] in { foreach MoveOp = [TFL_DepthToSpaceOp, TFL_ExpandDimsOp, TFL_SqueezeOp, TFL_ReshapeOp, TFL_TransposeOp] in { - def : Pat<(ValueOp:$value (MoveOp:$move $input, $move_def)), - (MoveOp (ValueOp $input), $move_def), - [(HasOneUse $move)]>; + def ReorderElementwiseAndMoveOperations#ValueOp#MoveOp : Pat< + (ValueOp:$value (MoveOp:$move $input, $move_def)), + (MoveOp (ValueOp $input), $move_def), + [(HasOneUse $move)]>; } } @@ -402,17 +404,20 @@ foreach ValueOp = [TFL_CeilOp, TFL_ExpOp, TFL_FloorOp, TFL_NegOp, // if called without a ranked tensor it will fail. def GetShape: NativeCodeCall<"GetShape($0)">; -// Convert squeeze to reshape if possible. 
-def : Pat<(TFL_SqueezeOp:$squeeze_op $input, $squeeze_dims), - (TFL_ReshapeOp $input, - (ConstantOp (GetShape $squeeze_op))), - [(AnyStaticShapeTensor $squeeze_op)]>; +// Returns True if the operand type is RankedTensorType. +def HasRankedTensor : Constraint< + CPred<"$0.getType().isa()">>; + +def ConvertSqueezeToReshape : Pat< + (TFL_SqueezeOp:$squeeze_op $input, $squeeze_dims), + (TFL_ReshapeOp $input, (ConstantOp (GetShape $squeeze_op))), + [(HasRankedTensor $squeeze_op)]>; // Convert expand_dims to reshape if possible. -def : Pat<(TFL_ExpandDimsOp:$expand_dims_op $input, $dim), - (TFL_ReshapeOp $input, - (ConstantOp (GetShape $expand_dims_op))), - [(AnyStaticShapeTensor $expand_dims_op)]>; +def ConvertExpandDimsToReshape : Pat< + (TFL_ExpandDimsOp:$expand_dims_op $input, $dim), + (TFL_ReshapeOp $input, (ConstantOp (GetShape $expand_dims_op))), + [(AnyStaticShapeTensor $expand_dims_op)]>; class FloatValueEquals : Constraint().getNumElements() == 1 &&" @@ -420,25 +425,32 @@ class FloatValueEquals : Constraint().getValues().begin() == " # val>>; // ReLU patterns -def : Pat<(TFL_MinimumOp (TFL_MaximumOp $input, - (ConstantOp $NegOne)), - (ConstantOp $One)), - (TFL_Relu1Op $input), - [(FloatValueEquals<"-1"> $NegOne), (FloatValueEquals<"1"> $One)]>; +def MatchReluPattern : Pat< + (TFL_MaximumOp $input, (ConstantOp $Zero)), + (TFL_ReluOp $input), + [(FloatValueEquals<"0"> $Zero)]>; -def : Pat<(TFL_MaximumOp (TFL_MinimumOp $input, - (ConstantOp $One)), - (ConstantOp $NegOne)), - (TFL_Relu1Op $input), - [(FloatValueEquals<"-1"> $NegOne), (FloatValueEquals<"1"> $One)]>; +def MatchRelu1Pattern1 : Pat< + (TFL_MinimumOp (TFL_MaximumOp $input, (ConstantOp $NegOne)), + (ConstantOp $One)), + (TFL_Relu1Op $input), + [(FloatValueEquals<"-1"> $NegOne), (FloatValueEquals<"1"> $One)]>; -def : Pat<(TFL_MaximumOp (TFL_MulOp:$mul_out $input1, - (ConstantOp F32ElementsAttr:$alpha), TFL_AF_None), - $input2), - (TFL_LeakyReluOp $input1, ExtractSingleElementAsFloat:$alpha), - [(ConstDoubleValueLessThan<"1"> $alpha), - (EqualOperands $input1, $input2), - (HasOneUse $mul_out)]>; +def MatchRelu1Pattern2 : Pat< + (TFL_MaximumOp (TFL_MinimumOp $input, (ConstantOp $One)), + (ConstantOp $NegOne)), + (TFL_Relu1Op $input), + [(FloatValueEquals<"-1"> $NegOne), (FloatValueEquals<"1"> $One)]>; + +def MatchLeakyRelu : Pat< + (TFL_MaximumOp + (TFL_MulOp:$mul_out $input1, + (ConstantOp F32ElementsAttr:$alpha), TFL_AF_None), + $input2), + (TFL_LeakyReluOp $input1, ExtractSingleElementAsFloat:$alpha), + [(ConstDoubleValueLessThan<"1"> $alpha), + (EqualOperands $input1, $input2), + (HasOneUse $mul_out)]>; def RemoveTrivialCast : Pat<(TFL_CastOp:$output $input), (replaceWithValue $input), @@ -451,23 +463,25 @@ def PReluAlphaRankCheck : Constraint< // PReLU pattern from Keras: // f(x) = Relu(x) + (-alpha * Relu(-x)) -def : Pat<(TFL_AddOp - (TFL_ReluOp:$relu_out $input1), - (TFL_MulOp:$mul_out - (TFL_ReluOp (TFL_NegOp:$input_neg_out $input2)), - $neg_alpha, - TFL_AF_None), - TFL_AF_None), - (TFL_PReluOp $input1, (TFL_NegOp $neg_alpha)), - [(EqualOperands $input1, $input2), - (PReluAlphaRankCheck $neg_alpha, $input1), - (HasOneUse $relu_out), - (HasOneUse $mul_out), - (HasOneUse $input_neg_out)]>; +def MatchPRelu : Pat< + (TFL_AddOp + (TFL_ReluOp:$relu_out $input1), + (TFL_MulOp:$mul_out + (TFL_ReluOp (TFL_NegOp:$input_neg_out $input2)), + $neg_alpha, + TFL_AF_None), + TFL_AF_None), + (TFL_PReluOp $input1, (TFL_NegOp $neg_alpha)), + [(EqualOperands $input1, $input2), + (PReluAlphaRankCheck $neg_alpha, $input1), + (HasOneUse 
$relu_out), + (HasOneUse $mul_out), + (HasOneUse $input_neg_out)]>; // The constant folding in this pass might produce constant in the tf dialect. // This rule is to legalize these constant to the tfl dialect. -def : Pat<(TF_ConstOp ElementsAttr:$value), (TFL_ConstOp $value)>; +def LegalizeConstOp : Pat< + (TF_ConstOp ElementsAttr:$value), (TFL_ConstOp $value)>; // Reorders adds to allow constant folding. // Add --> Add $input, $constantA @@ -476,13 +490,49 @@ def : Pat<(TF_ConstOp ElementsAttr:$value), (TFL_ConstOp $value)>; // Add --> $input // \--> Add ($constantA, $constantB) foreach ActFun = [TFL_AF_Relu, TFL_AF_Relu6, TFL_AF_Relu1, TFL_AF_None] in { - def : Pat<(TFL_AddOp - (TFL_AddOp:$first_output $input, (ConstantOp $a), TFL_AF_None), - (ConstantOp $b), ActFun), - (TFL_AddOp $input, - (TFL_AddOp (ConstantOp $a), (ConstantOp $b), TFL_AF_None), - ActFun), - [(HasOneUse $first_output)]>; + def ReorderAddToAllowConstFold_ActFunc_#ActFun : Pat< + (TFL_AddOp + (TFL_AddOp:$first_output $input, (ConstantOp $a), TFL_AF_None), + (ConstantOp $b), ActFun), + (TFL_AddOp $input, + (TFL_AddOp (ConstantOp $a), (ConstantOp $b), TFL_AF_None), + ActFun), + [(HasOneUse $first_output)]>; } +// We can eliminate Relu from Relu(SquaredDifference(x, y)), +// since the result of SquaredDifference is always non-negative. +// TFLite interpreter doesn't support Relu+int32 for now. So the test cases +// are failing without the following pattern to optimize Relu away fixes +// the problem. +def OptimizeReluSquaredDifference : Pat< + (TFL_ReluOp (TFL_SquaredDifferenceOp $l, $r)), + (TFL_SquaredDifferenceOp $l, $r)>; + +// Optimize X^1 o X +def OptimizePow1ToIdentity : Pat< + (TFL_PowOp $input, + (ConstantOp ConstantAttr, "1.0f">)), + (replaceWithValue $input)>; + +// Optimize X^2 to X*X +def OptimizePow2ToSquare : Pat< + (TFL_PowOp $input, + (ConstantOp ConstantAttr, "2.0f">)), + (TFL_MulOp $input, $input, TFL_AF_None)>; + +def CanOptimizeIdentityGatherNdOrScatterNdOp : Constraint())">>; + +def OptimizeIdentityGatherNdOp : Pat< + (TFL_GatherNdOp $params, (ConstantOp I32ElementsAttr: $indices)), + (replaceWithValue $params), + [(CanOptimizeIdentityGatherNdOrScatterNdOp $params, $indices)]>; + +def OptimizeIdentityScatterNdOp : Pat< + (TFL_ScatterNdOp (ConstantOp I32ElementsAttr: $indices), $params, $ignored), + (replaceWithValue $params), + [(CanOptimizeIdentityGatherNdOrScatterNdOp $params, $indices)]>; + diff --git a/tensorflow/compiler/mlir/lite/transforms/passes.h b/tensorflow/compiler/mlir/lite/transforms/passes.h index af97931b2a3..804a391231a 100644 --- a/tensorflow/compiler/mlir/lite/transforms/passes.h +++ b/tensorflow/compiler/mlir/lite/transforms/passes.h @@ -91,6 +91,9 @@ std::unique_ptr> CreateWhileOutlinePass(); // Verifies runtime constraints. std::unique_ptr> CreateRuntimeVerifyPass(); +// Creates raise custom ops pass, which legalize custom ops to TFL::CustomOp +std::unique_ptr> CreateRaiseCustomOpsPass(); + } // namespace TFL } // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc index 3d2ab662e6f..3be6246c0dd 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc @@ -34,6 +34,7 @@ limitations under the License. 
#include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project @@ -42,6 +43,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/utils/lstm_utils.h" #include "tensorflow/compiler/mlir/lite/utils/tftext_utils.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" // The cmd line flag to turn on/off Tf.Text API fusion. @@ -56,9 +58,11 @@ namespace TFL { namespace { constexpr char kTFAPIImplements[] = "tf.api_implements"; -constexpr char kTfTextAPIPRefix[] = "tftext:"; +constexpr char kTFTextAPIPrefix[] = "tftext:"; constexpr char kTfNMSPadded[] = "non_max_suppression_padded_v2"; +using mlir::TF::FuncAttr; + // Abstracts the conversion of the embedded lookup composite function. class ConvertEmbeddedLookupFunc { public: @@ -161,7 +165,9 @@ class PrepareCompositeFunctionsPass explicit PrepareCompositeFunctionsPass() {} private: + // TODO(b/160915525): Consolidate FuncAttr and StringAttr into one. void ConvertTFImplements(FuncOp func, StringAttr attr); + void ConvertTFImplementsWithAttributes(FuncOp func, FuncAttr attr); void ConvertTFAPIImplements(FuncOp func, StringAttr attr, ModuleOp module); void runOnOperation() override; }; @@ -204,10 +210,23 @@ void PrepareCompositeFunctionsPass::ConvertTFImplements(FuncOp func, } } +void PrepareCompositeFunctionsPass::ConvertTFImplementsWithAttributes( + FuncOp func, FuncAttr attr) { + auto api_name = attr.GetName().getLeafReference(); + bool enable_fuse_tftext = + fuse_tftext_flag || IsTFTextRegistered(tensorflow::OpRegistry::Global()); + if (api_name.startswith(kTFTextAPIPrefix) && enable_fuse_tftext) { + if (failed(ConvertTFTextAPI(func, api_name, attr))) { + return signalPassFailure(); + } + } +} + LogicalResult CheckOutputConsumer( Operation* call_op, int expected_num_outputs, llvm::DenseSet expected_consumer_indices) { - if (call_op->getNumResults() != expected_num_outputs) return failure(); + const int num_results = call_op->getNumResults(); + if (num_results != expected_num_outputs) return failure(); for (int i = 0; i < expected_num_outputs; ++i) { auto it = expected_consumer_indices.find(i); @@ -220,21 +239,31 @@ LogicalResult CheckOutputConsumer( } LogicalResult CheckFusableKerasLstm(FuncOp lstm_func, ModuleOp module) { - bool check_failed = false; for (auto func : module.getOps()) { - func.walk([&](Operation* op) { - auto call_op = dyn_cast_or_null(op); - if (call_op && op->getAttrOfType("f").getRootReference() == - lstm_func.getName()) { + if (func == lstm_func) continue; + auto result = func.walk([&](CallOpInterface op) { + if (dyn_cast(op.resolveCallable()) == lstm_func) { // Keras LSTM have 5 outputs. - // We should make sure only the first or the second output are consumed. - if (failed(CheckOutputConsumer(call_op, 5, {0, 1}))) - check_failed = true; + // We should make sure only the first or the second output are + // consumed. 
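+          // For example, a caller that only uses results 0 and/or 1 of the
+          // lstm_func call keeps the function fusable; any use of results 2,
+          // 3 or 4 makes CheckOutputConsumer fail and the fusion is skipped.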
+ if (failed(CheckOutputConsumer(op.getOperation(), 5, {0, 1}))) + return WalkResult::interrupt(); } + return WalkResult::advance(); }); + + if (result.wasInterrupted()) return failure(); + } + + // We should know the batch size in advance for the lstm fusion. + // A good indicator of batch size is both cell state and input state have + // fixed shape. (indices 1 & 2). + for (int i = 1; i < 3; ++i) { + auto input = lstm_func.getArgument(i); + auto input_type = input.getType().dyn_cast_or_null(); + if (!input_type || !input_type.hasStaticShape()) return failure(); } - if (check_failed) return failure(); return success(); } @@ -256,26 +285,27 @@ void PrepareCompositeFunctionsPass::ConvertTFAPIImplements(FuncOp func, OpBuilder builder(func.getBody()); if (failed(ConvertKerasLSTMLayer(func, &builder))) return signalPassFailure(); - } else if (fuse_tftext_flag || - IsTfTextRegistered(tensorflow::OpRegistry::Global())) { - if (attr.getValue().startswith(kTfTextAPIPRefix)) { - if (failed(ConvertTFTextAPI(func, attr.getValue()))) { - return signalPassFailure(); - } - } } } void PrepareCompositeFunctionsPass::runOnOperation() { auto module = getOperation(); for (auto func : module.getOps()) { - // We have two kinds of implements: - // 1) tf._implements. - // 2) tf.api_implements. + // We have three kinds of implements: + // 1) tf._implements, with string attributes. + // 2) tf._implements, with proto attributes. + // 3) tf.api_implements. // We need to handle them separately. - auto tf_implements_attr = func.getAttrOfType(kTFImplements); + auto tf_implements_attr_str = func.getAttrOfType(kTFImplements); + if (tf_implements_attr_str) { + ConvertTFImplements(func, tf_implements_attr_str); + continue; + } + + auto tf_implements_attr = func.getAttrOfType(kTFImplements); if (tf_implements_attr) { - ConvertTFImplements(func, tf_implements_attr); + ConvertTFImplementsWithAttributes(func, tf_implements_attr); + continue; } auto tf_api_implements_attr = diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc index 6ee988496fa..62688937d7e 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc @@ -41,7 +41,9 @@ limitations under the License. #include "mlir/Analysis/LoopAnalysis.h" // from @llvm-project #include "mlir/Dialect/Quant/FakeQuantSupport.h" // from @llvm-project #include "mlir/Dialect/Quant/UniformSupport.h" // from @llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project @@ -49,6 +51,7 @@ limitations under the License. #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" #include "tensorflow/compiler/mlir/lite/transforms/dilated_conv.h" @@ -57,7 +60,9 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/lite/utils/validators.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/einsum.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/unroll_batch_matmul.h" +#include "tensorflow/compiler/mlir/xla/transforms/passes.h" #define DEBUG_TYPE "tf-tfl-legalization" @@ -494,7 +499,8 @@ struct ConvertTFStridedSlice : public RewritePattern { original_input_type.getShape(); SmallVector new_shape; int index = 0; - while (index < original_input_shape.size() || new_axis_mask) { + const int original_input_rank = original_input_shape.size(); + while (index < original_input_rank || new_axis_mask) { if (new_axis_mask & 1) { new_shape.emplace_back(1); } else { @@ -696,6 +702,23 @@ LogicalResult ValidateOp(Operation *op) { return failure(has_illegal_ops); } +// Converts a set of TF2XLA ops into pure TF ops for future legalizations as +// TF2XLA ops aren't supported by later stages. +LogicalResult ConvertTf2XlaOps(FuncOp func, MLIRContext *context) { + ConversionTarget target(*context); + target.addLegalDialect(); + target.addLegalDialect(); + target.addLegalOp(); + target.addLegalOp(); + target.addIllegalOp(); + + OwningRewritePatternList patterns; + mhlo::PopulateLegalizeTfWithTf2XlaPatterns("XLA_CPU_JIT", patterns); + TF::PopulateLegalizeHloToTfPatterns(&patterns, context); + + return applyPartialConversion(func, target, patterns); +} + void PrepareTFPass::runOnFunction() { OwningRewritePatternList patterns; auto func = getFunction(); @@ -711,6 +734,11 @@ void PrepareTFPass::runOnFunction() { return; } + if (failed(ConvertTf2XlaOps(func, ctx))) { + signalPassFailure(); + return; + } + // This pattern was intented to uses TFL QDQs to preserve the quantization // parameters from the TF Quant ops, thus this pattern should run with the // first `applyPatternsGreedily` method, which would otherwise removes the diff --git a/tensorflow/compiler/mlir/lite/transforms/quantize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/quantize_patterns.td index 22bcc563f7b..38c754ed08c 100644 --- a/tensorflow/compiler/mlir/lite/transforms/quantize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/quantize_patterns.td @@ -33,7 +33,7 @@ def : Pat<(TFL_QuantizeOp (TFL_DequantizeOp $in), $qt), (replaceWithValue $in)>; // point constant. def : Pat<(TFL_DequantizeOp (TFL_QuantizeOp (ConstantOp F32ElementsAttr:$cst), $qt)), - (ConstantOp $cst)>; + (TFL_ConstOp $cst)>; // Quantize the value of a constant op if the quantization parameters have been // propagated to the output. diff --git a/tensorflow/compiler/mlir/lite/transforms/raise_custom_ops.cc b/tensorflow/compiler/mlir/lite/transforms/raise_custom_ops.cc new file mode 100644 index 00000000000..40cca526951 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/transforms/raise_custom_ops.cc @@ -0,0 +1,80 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir { +namespace TFL { +namespace { +// This transformation pass takes an operation with unknown op properties and +// wrap it by a TFL::CustomTfOp. +struct RaiseCustomOpsPass + : public PassWrapper { + void runOnFunction() override; +}; + +void RaiseCustomOpsPass::runOnFunction() { + auto fn = getFunction(); + OpBuilder builder(fn.getContext()); + + llvm::SmallVector custom_ops; + for (Operation &op : fn.getOps()) { + // Skips the ops with known op property. + if (op.getAbstractOperation()) continue; + // Skips already imported ops that are imported as CustomTfOp. + if (op.getParentOfType()) continue; + if (llvm::isa(op) || llvm::isa(op)) + continue; + custom_ops.push_back(&op); + } + + for (auto *op : custom_ops) { + builder.setInsertionPoint(op); + auto custom_op = builder.create( + op->getLoc(), op->getResultTypes(), op->getOperands()); + Region region; + region.push_back(new Block); + + builder.setInsertionPointToEnd(®ion.front()); + Operation *inner_op = builder.clone(*op); + builder.create(op->getLoc(), inner_op->getResults()); + custom_op.body().takeBody(region); + + op->replaceAllUsesWith(custom_op); + op->erase(); + } +} +} // namespace + +// Creates an instance of the TensorFlow Lite dialect raise custom op pass. +std::unique_ptr> CreateRaiseCustomOpsPass() { + return std::make_unique(); +} + +static PassRegistration pass( + "tfl-raise-custom-ops", "Raise custom ops into tflite dialect."); + +} // namespace TFL +} // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/transforms/while_loop_outline.cc b/tensorflow/compiler/mlir/lite/transforms/while_loop_outline.cc index 106b0f9af83..56b38ec58d8 100644 --- a/tensorflow/compiler/mlir/lite/transforms/while_loop_outline.cc +++ b/tensorflow/compiler/mlir/lite/transforms/while_loop_outline.cc @@ -80,7 +80,7 @@ void WhileOutlinePass::OutlineWhile(WhileOp while_op) { // The basic block arguments correspond to values that are loop carried, while // all those post are loop independent. Initialize extern_values with while_op // not loop carried operands. - auto num_loop_carried = while_op.cond().front().getNumArguments(); + auto num_loop_carried = while_op.cond().getNumArguments(); auto not_carried_operands = while_op.getOperands().drop_front(num_loop_carried); extern_values.insert(not_carried_operands.begin(), @@ -124,8 +124,7 @@ void WhileOutlinePass::OutlineWhile(WhileOp while_op) { // Collect new types. SmallVector types; types.reserve(extra_operands.size() + while_op.getNumOperands()); - for (BlockArgument ba : while_op.cond().front().getArguments()) - types.push_back(ba.getType()); + for (Type type : while_op.cond().getArgumentTypes()) types.push_back(type); for (Value operand : extern_values) types.push_back(operand.getType()); // Create outline function from region. 
Optional pass extra arguments through @@ -143,8 +142,7 @@ void WhileOutlinePass::OutlineWhile(WhileOp while_op) { type = FunctionType::get(types, result_types, &getContext()); } - auto outlined_func = builder.create(while_op.getLoc(), name, type, - ArrayRef{}); + auto outlined_func = builder.create(while_op.getLoc(), name, type); outlined_func.getBody().takeBody(region); Region& func_region = outlined_func.getBody(); diff --git a/tensorflow/compiler/mlir/lite/utils/convert_type.cc b/tensorflow/compiler/mlir/lite/utils/convert_type.cc index 22283d7eace..6b3ad78a830 100644 --- a/tensorflow/compiler/mlir/lite/utils/convert_type.cc +++ b/tensorflow/compiler/mlir/lite/utils/convert_type.cc @@ -53,6 +53,8 @@ mlir::Type ConvertElementType(tflite::TensorType type, mlir::Builder builder) { return builder.getIntegerType(16); case tflite::TensorType_COMPLEX64: return mlir::ComplexType::get(builder.getF32Type()); + case tflite::TensorType_COMPLEX128: + return mlir::ComplexType::get(builder.getF64Type()); case tflite::TensorType_INT8: return builder.getIntegerType(8); } @@ -64,6 +66,8 @@ tensorflow::DataType TflTypeToTfType(tflite::TensorType type) { return tensorflow::DT_BOOL; case tflite::TensorType_COMPLEX64: return tensorflow::DT_COMPLEX64; + case tflite::TensorType_COMPLEX128: + return tensorflow::DT_COMPLEX128; case tflite::TensorType_FLOAT16: return tensorflow::DT_HALF; case tflite::TensorType_FLOAT32: diff --git a/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc b/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc index 2f876c68fb8..3a469dd7341 100644 --- a/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc +++ b/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc @@ -134,7 +134,7 @@ Value SliceRankedTensor(OpBuilder* builder, Value input, // the input tensor's dimensions, return 0-valued tensor of the requested // shape. ArrayRef input_shape = GetRankedTensorShape(input); - for (int i = 0; i < input_shape.size(); i++) { + for (int i = 0, end = input_shape.size(); i < end; i++) { if (begin_values[i] < 0 || (begin_values[i] + size_values[i] > input_shape[i])) { return CreateF32SplatConst(builder, size_shape, 0, location); diff --git a/tensorflow/compiler/mlir/lite/utils/tftext_utils.cc b/tensorflow/compiler/mlir/lite/utils/tftext_utils.cc index 2ed0891dc59..96d22cb51e9 100644 --- a/tensorflow/compiler/mlir/lite/utils/tftext_utils.cc +++ b/tensorflow/compiler/mlir/lite/utils/tftext_utils.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/utils/tftext_utils.h" +#include "flatbuffers/flexbuffers.h" // from @flatbuffers #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/None.h" #include "llvm/ADT/SmallVector.h" @@ -28,6 +29,7 @@ limitations under the License. 
#include "mlir/IR/Identifier.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project #include "mlir/IR/OpDefinition.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project @@ -43,30 +45,35 @@ namespace TFL { namespace { +constexpr char kNgrams[] = "tftext:Ngrams"; constexpr char kWhitespaceTokenizer[] = "tftext:WhitespaceTokenizer"; -constexpr char kTFAPIImplements[] = "tf.api_implements"; +constexpr char kTFImplements[] = "tf._implements"; -inline OpaqueElementsAttr emptyCustomOption(OpBuilder* builder) { - std::string content = ""; +using mlir::TF::FuncAttr; +using mlir::TF::StringType; + +inline OpaqueElementsAttr CustomOption(OpBuilder* builder, + const std::string& content) { ShapedType type = RankedTensorType::get( {static_cast(content.size())}, builder->getIntegerType(8)); return OpaqueElementsAttr::get( - builder->getContext()->getRegisteredDialect("tfl"), type, content); + builder->getContext()->getRegisteredDialect("tfl"), type, + StringRef(content.data(), content.size())); } -inline RankedTensorType getInputType(mlir::FuncOp func, int idx) { - return func.getType() - .getInput(idx) - .dyn_cast_or_null(); +inline TensorType GetInputType(FuncOp func, int idx) { + return func.getType().getInput(idx).dyn_cast_or_null(); } -inline RankedTensorType getResultType(mlir::FuncOp func, int idx) { - return func.getType() - .getResult(idx) - .dyn_cast_or_null(); +inline TensorType GetResultType(FuncOp func, int idx) { + return func.getType().getResult(idx).dyn_cast_or_null(); } -LogicalResult VerifyWhitespaceTokenizer(mlir::FuncOp func) { +inline bool RankEquals(const TensorType& type, int rank) { + return type && type.hasRank() && type.getRank() == rank; +} + +LogicalResult VerifyWhitespaceTokenizer(FuncOp func) { // In the case of input tensor with 0 rank. // Whitespace tokenizer generates 1 output: // * String tensor for tokens. @@ -81,8 +88,8 @@ LogicalResult VerifyWhitespaceTokenizer(mlir::FuncOp func) { // * 1st output is the value of ragged tensor; // * 2nd output is the inner offset; // * 3rd output is the outer offset. 
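  // For example (illustrative), tokenizing the rank-1 input
  //   ["hello there", "hi"]
  // yields values ["hello", "there", "hi"] plus a row-offset tensor
  // [0, 2, 3]; each additional input rank adds one more offset output, which
  // is why the expected number of outputs below depends on the input rank.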
- auto input_type = getInputType(func, 0); - if (!input_type || !input_type.getElementType().isa() || + auto input_type = GetInputType(func, 0); + if (!input_type || !input_type.getElementType().isa() || !input_type.hasRank()) { return func.emitError() << "Input should be a string tensor"; } @@ -98,21 +105,21 @@ LogicalResult VerifyWhitespaceTokenizer(mlir::FuncOp func) { << "output(s) when input has rank " << input_type.getRank(); } - auto value_type = getResultType(func, 0); - if (!value_type || !value_type.hasRank() || value_type.getRank() != 1 || - !value_type.getElementType().isa()) { + auto value_type = GetResultType(func, 0); + if (!RankEquals(value_type, 1) || + !value_type.getElementType().isa()) { return func.emitError() << "1st output should be string tensor"; } if (func.getNumResults() > 1) { - auto offset_type = getResultType(func, 1); - if (!offset_type || !offset_type.hasRank() || offset_type.getRank() != 1 || + auto offset_type = GetResultType(func, 1); + if (!RankEquals(offset_type, 1) || !offset_type.getElementType().isInteger(64)) { return func.emitError() << "2nd output should be int64 tensor"; } } if (func.getNumResults() > 2) { - auto offset_type = getResultType(func, 2); - if (!offset_type || !offset_type.hasRank() || offset_type.getRank() != 1 || + auto offset_type = GetResultType(func, 2); + if (!RankEquals(offset_type, 1) || !offset_type.getElementType().isInteger(64)) { return func.emitError() << "3rd output should be int64 tensor"; } @@ -121,36 +128,168 @@ LogicalResult VerifyWhitespaceTokenizer(mlir::FuncOp func) { return success(); } -LogicalResult ConvertWhitespaceTokenizer(mlir::FuncOp func, - llvm::StringRef api) { +LogicalResult ConvertWhitespaceTokenizer(FuncOp func, llvm::StringRef api, + FuncAttr attr) { func.eraseBody(); func.addEntryBlock(); - func.setAttr(kTFAPIImplements, StringAttr::get(api, func.getContext())); - Value text = func.getArgument(0); + func.setAttr(kTFImplements, attr); OpBuilder builder(func.getBody()); - - auto op = builder.create( - func.getLoc(), func.getType().getResults(), ValueRange(text), api, - emptyCustomOption(&builder)); - builder.create(func.getLoc(), op.getResults()); + std::string empty_option_buffer; + auto op = builder.create( + func.getLoc(), func.getType().getResults(), func.getArguments(), api, + CustomOption(&builder, empty_option_buffer)); + builder.create(func.getLoc(), op.getResults()); return success(); } + +LogicalResult VerifyNgrams(FuncOp func) { + // The inputs and outputs should be the same: + // * A string tensor for tokens/ragged tensor values. + // * Zero or more row_split tensors. 
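+  // For example (illustrative), an ngrams function over a ragged input may
+  // have a signature like
+  //   (tensor<?x!tf.string>, tensor<?xi64>)
+  //       -> (tensor<?x!tf.string>, tensor<?xi64>)
+  // i.e. a 1-D string values tensor plus one int64 row_splits tensor on both
+  // sides, while a dense input has only the string tensor and no row_splits.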
+ constexpr int kValues = 0; + constexpr int kRowSplits = 1; + + if (func.getType().getInputs().size() != func.getType().getResults().size()) { + return func.emitError() << "Mismatched number of inputs and outputs."; + } + + int row_splits = func.getType().getInputs().size() - kRowSplits; + if (row_splits == 0) { + auto input_values = GetInputType(func, kValues); + if (!input_values || !input_values.getElementType().isa()) { + return func.emitError() + << "Input " << kValues << " should be a string tensor"; + } + auto output_values = GetResultType(func, kValues); + if (!output_values || !output_values.getElementType().isa()) { + return func.emitError() + << "Output " << kValues << " should be a string tensor"; + } + + if (input_values.hasRank() && output_values.hasRank() && + input_values.getRank() != output_values.getRank()) { + return func.emitError() << "Input " << kValues << " and output " + << kValues << " should have the same rank"; + } + } else { + auto input_values = GetInputType(func, kValues); + if (!RankEquals(input_values, 1) || + !input_values.getElementType().isa()) { + return func.emitError() + << "Input " << kValues << " should be a 1D string tensor"; + } + auto output_values = GetResultType(func, kValues); + if (!RankEquals(output_values, 1) || + !output_values.getElementType().isa()) { + return func.emitError() + << "Output " << kValues << " should be a 1D string tensor"; + } + + for (int i = 0; i < row_splits; ++i) { + const int row_index = i + kRowSplits; + auto input_row_splits = GetInputType(func, row_index); + if (!RankEquals(input_row_splits, 1) || + !input_row_splits.getElementType().isInteger(64)) { + return func.emitError() + << "Input " << row_index << " should be a 1D int64 tensor"; + } + auto output_row_splits = GetResultType(func, row_index); + if (!RankEquals(output_row_splits, 1) || + !output_row_splits.getElementType().isInteger(64)) { + return func.emitError() + << "Output " << row_index << " should be a 1D int64 tensor"; + } + } + } + + return success(); +} + +LogicalResult CreateNgramsCustomOption(FuncOp func, DictionaryAttr attrs, + std::string& custom_option_buffer) { + flexbuffers::Builder fbb; + size_t start_map = fbb.StartMap(); + + auto width = attrs.get("width").dyn_cast_or_null(); + if (!width) { + return func.emitError() << "'width' attribute is not set or not an integer"; + } + fbb.Int("width", width.getInt()); + + auto string_separator = + attrs.get("string_separator").dyn_cast_or_null(); + if (!string_separator) { + return func.emitError() + << "'string_separator' attribute is not set or not a string"; + } + // StringAttrs are not guaranteed to be NUL terminated, but flexbuffers + // strings expect NUL terminated strings. + std::string string_separator_str(string_separator.getValue().data(), + string_separator.getValue().size()); + fbb.String("string_separator", string_separator_str); + + auto axis = attrs.get("axis").dyn_cast_or_null(); + if (!axis) { + return func.emitError() << "'axis' attribute is not set or not an integer"; + } + fbb.Int("axis", axis.getInt()); + + auto reduction_type = + attrs.get("reduction_type").dyn_cast_or_null(); + if (!reduction_type) { + return func.emitError() + << "'reduction_type' attribute is not set or not a string"; + } + // StringAttrs are not guaranteed to be NUL terminated, but flexbuffers + // strings expect NUL terminated strings. 
+ std::string reduction_type_str(reduction_type.getValue().data(), + reduction_type.getValue().size()); + fbb.String("reduction_type", reduction_type_str); + + fbb.EndMap(start_map); + fbb.Finish(); + custom_option_buffer.assign(fbb.GetBuffer().begin(), fbb.GetBuffer().end()); + return success(); +} + +LogicalResult ConvertNgrams(FuncOp func, llvm::StringRef api, FuncAttr attr) { + func.eraseBody(); + func.addEntryBlock(); + func.setAttr(kTFImplements, attr); + OpBuilder builder(func.getBody()); + std::string custom_option_buffer; + if (failed(CreateNgramsCustomOption(func, attr.GetAttrs(), + custom_option_buffer))) { + return failure(); + } + auto op = builder.create( + func.getLoc(), func.getType().getResults(), func.getArguments(), api, + CustomOption(&builder, custom_option_buffer)); + builder.create(func.getLoc(), op.getResults()); + return success(); +} + } // namespace -LogicalResult ConvertTFTextAPI(mlir::FuncOp func, llvm::StringRef api) { +LogicalResult ConvertTFTextAPI(FuncOp func, llvm::StringRef api, + FuncAttr attr) { if (api.str() == kWhitespaceTokenizer) { if (succeeded(VerifyWhitespaceTokenizer(func))) { - return ConvertWhitespaceTokenizer(func, api); + return ConvertWhitespaceTokenizer(func, api, attr); + } + } else if (api.str() == kNgrams) { + if (succeeded(VerifyNgrams(func))) { + return ConvertNgrams(func, api, attr); } } return failure(); } -bool IsTfTextRegistered(const tensorflow::OpRegistry* op_registery) { - const std::vector kTfTextOps = { +bool IsTFTextRegistered(const tensorflow::OpRegistry* op_registery) { + const std::vector kTFTextOps = { "WhitespaceTokenizeWithOffsets", }; - for (const auto& iter : kTfTextOps) { + for (const auto& iter : kTFTextOps) { if (op_registery->LookUp(iter)) { return true; } diff --git a/tensorflow/compiler/mlir/lite/utils/tftext_utils.h b/tensorflow/compiler/mlir/lite/utils/tftext_utils.h index c52ee019d8d..55e4680c3dd 100644 --- a/tensorflow/compiler/mlir/lite/utils/tftext_utils.h +++ b/tensorflow/compiler/mlir/lite/utils/tftext_utils.h @@ -27,14 +27,18 @@ limitations under the License. #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" #include "tensorflow/core/framework/op.h" namespace mlir { namespace TFL { -LogicalResult ConvertTFTextAPI(mlir::FuncOp func, llvm::StringRef api); +// Fuse TF.Text APIs annotated by tf.function to a TFLite custom op. +LogicalResult ConvertTFTextAPI(mlir::FuncOp func, llvm::StringRef api, + mlir::TF::FuncAttr attr); -bool IsTfTextRegistered(const tensorflow::OpRegistry* op_registery); +// Check if TF.Text Tensorflow ops are registered. 
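+// For example, IsTFTextRegistered(tensorflow::OpRegistry::Global()) returns
+// true when the TF.Text kernels (such as WhitespaceTokenizeWithOffsets) are
+// linked into the binary, which enables the fusion even when the
+// command-line fusion flag is not set.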
+bool IsTFTextRegistered(const tensorflow::OpRegistry* op_registery); } // end namespace TFL } // end namespace mlir diff --git a/tensorflow/compiler/mlir/lite/utils/tftext_utils_test.cc b/tensorflow/compiler/mlir/lite/utils/tftext_utils_test.cc index 7d29264aaae..9bcfa89c544 100644 --- a/tensorflow/compiler/mlir/lite/utils/tftext_utils_test.cc +++ b/tensorflow/compiler/mlir/lite/utils/tftext_utils_test.cc @@ -41,13 +41,13 @@ void Register(const std::string& op_name, OpRegistry* registry) { TEST(TfTextUtilsTest, TestTfTextRegistered) { std::unique_ptr registry(new OpRegistry); Register("WhitespaceTokenizeWithOffsets", registry.get()); - EXPECT_TRUE(IsTfTextRegistered(registry.get())); + EXPECT_TRUE(IsTFTextRegistered(registry.get())); } TEST(TfTextUtilsTest, TestTfTextNotRegistered) { std::unique_ptr registry(new OpRegistry); Register("Test", registry.get()); - EXPECT_FALSE(IsTfTextRegistered(registry.get())); + EXPECT_FALSE(IsTFTextRegistered(registry.get())); } } // namespace TFL } // namespace mlir diff --git a/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc b/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc index 67002aa65bf..8be6facce38 100644 --- a/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc +++ b/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc @@ -115,13 +115,15 @@ Status MlirFunctionOptimizationPass::Run( }); if (!is_enabled) { - VLOG(0) << "None of the MLIR optimization passes are enabled " - << "(registered " << registry_->passes().size() << ")"; + LOG_FIRST_N(INFO, 1) + << "None of the MLIR optimization passes are enabled " + << "(registered " << registry_->passes().size() << ")"; return Status::OK(); } - VLOG(0) << "Running MLIR Graph Optimization Passes " - << "(registered " << registry_->passes().size() << " passes)"; + LOG_FIRST_N(INFO, 1) << "Running MLIR Graph Optimization Passes " + << "(registered " << registry_->passes().size() + << " passes)"; GraphDebugInfo debug_info; RegisterDialects(); @@ -130,6 +132,12 @@ Status MlirFunctionOptimizationPass::Run( import_config.graph_as_function = true; import_config.control_outputs = *control_ret_node_names; import_config.upgrade_legacy = true; + // Disable shape inference during import as some TensorFlow op fails during + // shape inference with dynamic shaped operands. This in turn causes the + // import to fail. Shape inference during import is going to be removed and + // the shape inference pass is run early in the pass pipeline, shape inference + // during import is not necessary. 
+ import_config.enable_shape_inference = false; TF_ASSIGN_OR_RETURN(auto module_ref, ConvertGraphToMlir(**graph, debug_info, *flib_def, import_config, &context)); @@ -187,13 +195,15 @@ Status MlirV1CompatGraphOptimizationPass::Run( }); if (!is_enabled) { - VLOG(0) << "None of the MLIR optimization passes are enabled " - << "(registered" << registry_->passes().size() << " passes)"; + LOG_FIRST_N(INFO, 1) + << "None of the MLIR optimization passes are enabled " + << "(registered " << registry_->passes().size() << " passes)"; return Status::OK(); } - VLOG(0) << "Running MLIR Graph Optimization V1 Compat Passes " - << "(registered" << registry_->passes().size() << " passes)"; + LOG_FIRST_N(INFO, 1) << "Running MLIR Graph Optimization V1 Compat Passes " + << "(registered " << registry_->passes().size() + << " passes)"; GraphDebugInfo debug_info; RegisterDialects(); diff --git a/tensorflow/compiler/mlir/runlit.cfg.py b/tensorflow/compiler/mlir/runlit.cfg.py index e3158f21cb2..45c8dce8422 100644 --- a/tensorflow/compiler/mlir/runlit.cfg.py +++ b/tensorflow/compiler/mlir/runlit.cfg.py @@ -73,7 +73,8 @@ tool_names = [ 'mlir-opt', 'mlir-hlo-opt', 'mlir-translate', 'tf-opt', 'tf_tfl_translate', 'tf_tfjs_translate', 'flatbuffer_to_string', 'flatbuffer_translate', 'tf-mlir-translate', 'mlir-tflite-runner', 'tfcompile', - 'json_to_flatbuffer', 'xla-gpu-opt', 'xla-opt', 'hlo_to_llvm_ir' + 'json_to_flatbuffer', 'xla-gpu-opt', 'xla-opt', 'hlo_to_llvm_ir', + 'kernel-gen-opt', 'xla-thunks-opt' ] tools = [ToolSubst(s, unresolved='ignore') for s in tool_names] llvm_config.add_tool_substitutions(tools, tool_dirs) diff --git a/tensorflow/compiler/mlir/runlit.site.cfg.py b/tensorflow/compiler/mlir/runlit.site.cfg.py index 82175d7f680..b4d3e6185a6 100644 --- a/tensorflow/compiler/mlir/runlit.site.cfg.py +++ b/tensorflow/compiler/mlir/runlit.site.cfg.py @@ -47,6 +47,7 @@ mlir_tf_tools_dirs = [ 'tensorflow/compiler/mlir/tensorflow', 'tensorflow/compiler/mlir/tfjs', 'tensorflow/compiler/mlir/xla', + 'tensorflow/compiler/mlir/tools/kernel_gen', 'tensorflow/compiler/aot', 'tensorflow/compiler/xla/service/mlir_gpu', 'tensorflow/compiler/xla/service/gpu/tests', diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index 14d7faecdca..d2e57f72774 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -88,6 +88,7 @@ gentbl( cc_library( name = "tensorflow_op_interfaces", srcs = [ + "ir/tf_op_interfaces.cc", "ir/tf_op_interfaces.cc.inc", "ir/tf_op_interfaces.h.inc", "ir/tf_verifiers.cc", @@ -105,15 +106,67 @@ cc_library( ) gentbl( - name = "tensorflow_ops_inc_gen", + name = "tensorflow_all_ops_inc_gen", tbl_outs = [ ( "-gen-op-decls", - "ir/tf_ops.h.inc", + "ir/tf_all_ops.h.inc", ), ( "-gen-op-defs", - "ir/tf_ops.cc.inc", + "ir/tf_all_ops.cc.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "ir/tf_ops.td", + td_srcs = [ + ":tensorflow_ops_td_files", + ], +) + +# We only shard tf_op on name for build performance reasons. 
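+# For example, with the two shards below, tf.Abs and tf.MatMul land in
+# tensorflow_ops_a_m while tf.Relu and tf.Sum land in tensorflow_ops_n_z;
+# each shard gets its own gentbl output and cc_library so the generated op
+# definitions compile separately instead of as one monolithic target.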
+tf_ops_category_list = [ + { + "name": "ops_a_m", + "include": "tf.[A-M].*$$", + }, + { + "name": "ops_n_z", + "include": "tf.[N-Z].*$$", + }, +] + +[[ + gentbl( + name = "tensorflow_" + target["name"] + "_inc_gen", + tbl_outs = [ + ( + "-gen-op-decls -op-include-regex='" + target["include"] + "'", + "ir/tf_" + target["name"] + ".h.inc", + ), + ( + "-gen-op-defs -op-include-regex='" + target["include"] + "'", + "ir/tf_" + target["name"] + ".cc.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "ir/tf_ops.td", + td_srcs = [ + ":tensorflow_ops_td_files", + ], + ), +] for target in tf_ops_category_list] + +gentbl( + name = "tensorflow_remaining_ops_inc_gen", + tbl_outs = [ + ( + "-gen-op-decls -op-exclude-regex='" + "|".join([target["include"] for target in tf_ops_category_list]) + "' ", + "ir/tf_remaining_ops.h.inc", + ), + ( + "-gen-op-defs -op-exclude-regex='" + "|".join([target["include"] for target in tf_ops_category_list]) + "' ", + "ir/tf_remaining_ops.cc.inc", ), ], tblgen = "@llvm-project//mlir:mlir-tblgen", @@ -179,7 +232,7 @@ gentbl( name = "tensorflow_device_ops_inc_gen", tbl_outs = [ ( - "-gen-op-decls", + "-gen-op-decls ", "ir/tf_device.h.inc", ), ( @@ -280,28 +333,72 @@ cc_library( deps = [ ":tensorflow_types", "@llvm-project//mlir:IR", + "@llvm-project//mlir:SideEffects", "@llvm-project//mlir:Support", ], ) +[[ + cc_library( + name = "tensorflow_" + target["name"], + srcs = [ + "ir/tf_ops.h", + "ir/tf_remaining_ops.h", + "ir/tf_" + target["name"] + ".cc", + "ir/tf_" + target["name"] + ".cc.inc", + ] + ["ir/tf_" + target["name"] + ".h" for target in tf_ops_category_list], + hdrs = [ + ], + textual_hdrs = [ + "ir/tf_all_ops.h.inc", + "ir/tf_ops_helpers.inc", + "ir/tf_remaining_ops.h.inc", + ] + ["ir/tf_" + target["name"] + ".h.inc" for target in tf_ops_category_list], + deps = [ + ":tensorflow_attributes", + ":tensorflow_canonicalize_inc_gen", + ":tensorflow_op_interfaces", + ":tensorflow_op_interfaces_inc_gen", + ":tensorflow_side_effects", + ":tensorflow_structs", + ":tensorflow_traits", + ":tensorflow_types", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:DerivedAttributeOpInterface", + "@llvm-project//mlir:Dialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:InferTypeOpInterface", + "@llvm-project//mlir:LoopLikeInterface", + "@llvm-project//mlir:Parser", + "@llvm-project//mlir:SideEffects", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + ] + [":tensorflow_" + target["name"] + "_inc_gen"], + ), +] for target in tf_ops_category_list] + cc_library( - name = "tensorflow_ops", + name = "tensorflow_remaining_ops", srcs = [ - "ir/tf_ops.cc", - "ir/tf_ops.cc.inc", "ir/tf_ops.h", - ], + "ir/tf_remaining_ops.h", + "ir/tf_remaining_ops.cc", + ] + ["ir/tf_" + target["name"] + ".h" for target in tf_ops_category_list], hdrs = [ ], textual_hdrs = [ - "ir/tf_ops.h.inc", - ], + "ir/tf_all_ops.h.inc", + "ir/tf_ops_helpers.inc", + "ir/tf_remaining_ops.h.inc", + ] + ["ir/tf_" + target["name"] + ".h.inc" for target in tf_ops_category_list], deps = [ ":tensorflow_attributes", ":tensorflow_canonicalize_inc_gen", ":tensorflow_op_interfaces", ":tensorflow_op_interfaces_inc_gen", - ":tensorflow_ops_inc_gen", + ":tensorflow_remaining_ops_inc_gen", ":tensorflow_side_effects", ":tensorflow_structs", ":tensorflow_traits", @@ -321,6 +418,43 @@ cc_library( ], ) +cc_library( + name = "tensorflow_ops", + srcs = [ + "ir/tf_ops.cc", + "ir/tf_ops.h", + ], + textual_hdrs = [ + 
"ir/tf_all_ops.h.inc", + "ir/tf_remaining_ops.h", + ] + ["ir/tf_" + target["name"] + ".h" for target in tf_ops_category_list], + deps = [ + ":tensorflow_all_ops_inc_gen", + ":tensorflow_remaining_ops_inc_gen", + ":tensorflow_attributes", + ":tensorflow_canonicalize_inc_gen", + ":tensorflow_op_interfaces", + ":tensorflow_op_interfaces_inc_gen", + ":tensorflow_side_effects", + ":tensorflow_structs", + ":tensorflow_traits", + ":tensorflow_types", + ":tensorflow_remaining_ops", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:DerivedAttributeOpInterface", + "@llvm-project//mlir:Dialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:InferTypeOpInterface", + "@llvm-project//mlir:LoopLikeInterface", + "@llvm-project//mlir:Parser", + "@llvm-project//mlir:SideEffects", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + ] + [":tensorflow_" + target["name"] for target in tf_ops_category_list], +) + cc_library( name = "tensorflow_structs", srcs = [ @@ -393,12 +527,14 @@ cc_library( includes = ["include"], deps = [ ":error_util", + ":tensorflow_all_ops_inc_gen", ":tensorflow_attributes", ":tensorflow_canonicalize_inc_gen", ":tensorflow_device_ops_inc_gen", ":tensorflow_executor_inc_gen", ":tensorflow_op_interfaces", ":tensorflow_ops", + ":tensorflow_side_effects", ":tensorflow_structs", ":tensorflow_traits", ":tensorflow_types", @@ -540,6 +676,7 @@ cc_library( cc_library( name = "tf_saved_model_passes", srcs = [ + "transforms/deduplicate_bound_input_bindings.cc", "transforms/freeze_global_tensors.cc", "transforms/lift_variables_pass.cc", "transforms/optimize_global_tensors.cc", @@ -567,6 +704,30 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "tensorflow_analysis", + srcs = [ + "analysis/per_function_aggregate_analysis.h", + "analysis/resource_alias_analysis.cc", + "analysis/side_effect_analysis.cc", + ], + hdrs = [ + "analysis/resource_alias_analysis.h", + "analysis/side_effect_analysis.h", + ], + deps = [ + ":tensorflow", + ":tensorflow_types", + "//tensorflow/compiler/tf2xla:resource_operation_table", + "//tensorflow/core:framework", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + ], +) + cc_library( name = "tensorflow_passes", srcs = [ @@ -592,11 +753,15 @@ cc_library( "transforms/generated_optimize.inc", "transforms/gpu_fusion.cc", "transforms/graph_pruning.cc", + "transforms/guarantee_all_funcs_one_use.cc", + "transforms/init_text_file_to_import.cc", "transforms/launch_to_device_attribute.cc", "transforms/layout_optimization.cc", + "transforms/mark_ops_for_outside_compilation.cc", "transforms/materialize_mlir_passthrough_op.cc", "transforms/optimize.cc", "transforms/parallel_execute_to_islands.cc", + "transforms/parallelize_embedding_params_ops_pass.cc", "transforms/promote_resources_to_args.cc", "transforms/readonly_references_to_resources.cc", "transforms/region_control_flow_to_functional.cc", @@ -611,6 +776,7 @@ cc_library( "transforms/stack_ops_decomposition.cc", "transforms/tensor_array_ops_decomposition.cc", "transforms/tensor_list_ops_decomposition.cc", + "transforms/test_resource_alias_analysis.cc", "transforms/test_side_effect_analysis.cc", "transforms/tf_data_optimization_pass.cc", "transforms/tf_device_assignment.cc", @@ -632,6 +798,7 @@ cc_library( "translate/tf_functional_to_executor.cc", ], hdrs = [ + "transforms/attribute_utils.h", "transforms/batchmatmul_to_einsum.h", 
"transforms/bridge.h", "transforms/collection_ops_util.h", @@ -650,8 +817,8 @@ cc_library( ":error_util", ":export_tf_dialect_op", ":mangling_util", - ":side_effect_analysis", ":tensorflow", + ":tensorflow_analysis", ":tensorflow_optimize_inc_gen", ":tensorflow_types", ":tf_data_optimization", @@ -661,6 +828,8 @@ cc_library( ":xla_sharding_util", "//tensorflow/compiler/mlir:op_or_arg_name_mapper", "//tensorflow/compiler/mlir/lite:validators", + "//tensorflow/compiler/mlir/xla:xla_legalize_tf", + "//tensorflow/compiler/mlir/xla:xla_legalize_tf_with_tf2xla", "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/compiler/xla:xla_proto_cc", "//tensorflow/compiler/xla/client:sharding_builder", @@ -671,6 +840,7 @@ cc_library( "//tensorflow/core/platform:random", "//tensorflow/core/protobuf/tpu:compile_metadata_proto_cc", "//tensorflow/core/protobuf/tpu:dynamic_padding_proto_cc", + "//tensorflow/core/tpu:tpu_embedding_optimization_parameters_utils", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", @@ -690,6 +860,7 @@ cc_library( cc_library( name = "tensorflow_test_passes", srcs = [ + "transforms/init_text_file_to_import_test_pass.cc", "transforms/lift_variables_test_pass.cc", "transforms/lower_tf_pass.cc", ], @@ -705,8 +876,10 @@ cc_library( "//tensorflow/core/platform:errors", "//tensorflow/core/platform:status", "//tensorflow/core/platform:threadpool_options", + "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Support", ], alwayslink = 1, @@ -1190,11 +1363,13 @@ cc_library( ":mlir_roundtrip_flags", "//tensorflow/cc/saved_model:bundle_v2", "//tensorflow/core:graph", + "//tensorflow/core:lib", "//tensorflow/core:lib_proto_parsing", "//tensorflow/core:ops", "//tensorflow/core:protos_all_cc", "//tensorflow/core/grappler/utils:transitive_fanin", "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:span", @@ -1315,6 +1490,7 @@ COMPILE_MLIR_UTIL_DEPS = [ ":mlir_roundtrip_flags", ":tensorflow", ":tensorflow_dialect_registration", + ":tensorflow_types", ":tensorflow_passes", ":translate_utils", "@com_google_absl//absl/types:optional", @@ -1333,10 +1509,13 @@ COMPILE_MLIR_UTIL_DEPS = [ "//tensorflow/compiler/mlir/xla:xla_legalize_tf", "//tensorflow/compiler/mlir/xla:xla_legalize_tf_with_tf2xla", "//tensorflow/compiler/tf2xla:common", - "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/tf2xla:xla_helpers", + "//tensorflow/compiler/tf2xla:xla_argument", + "//tensorflow/compiler/xla/client:xla_computation", + "//tensorflow/core/common_runtime:core_cpu_internal", + "//tensorflow/core/platform:logging", "//tensorflow/core:framework", "//tensorflow/core:protos_all_cc", - "//tensorflow/core/platform:logging", "//tensorflow/stream_executor/lib", "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/compiler/xla/service:hlo", @@ -1373,6 +1552,9 @@ tf_cc_test( srcs = ["utils/compile_mlir_util_test.cc"], deps = [ ":compile_mlir_util", + "//tensorflow/cc:function_ops", + "//tensorflow/cc:resource_variable_ops", + "//tensorflow/cc:scope", "//tensorflow/compiler/jit", "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/tf2xla:xla_compiler", @@ -1500,6 +1682,7 @@ cc_library( ":tensorflow", "//tensorflow/core:core_cpu_lib", "//tensorflow/core:framework", + "@com_google_absl//absl/strings", 
"@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:Support", @@ -1602,22 +1785,6 @@ cc_library( ], ) -cc_library( - name = "side_effect_analysis", - srcs = ["analysis/side_effect_analysis.cc"], - hdrs = ["analysis/side_effect_analysis.h"], - deps = [ - ":tensorflow", - ":tensorflow_types", - "//tensorflow/compiler/tf2xla:resource_operation_table", - "//tensorflow/core:framework", - "@com_google_absl//absl/strings", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Support", - ], -) - cc_library( name = "xla_sharding_util", srcs = [ diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/per_function_aggregate_analysis.h b/tensorflow/compiler/mlir/tensorflow/analysis/per_function_aggregate_analysis.h new file mode 100644 index 00000000000..da7a2bd9b5c --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/analysis/per_function_aggregate_analysis.h @@ -0,0 +1,76 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_ANALYSIS_PER_FUNCTION_AGGREGATE_ANALYSIS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_ANALYSIS_PER_FUNCTION_AGGREGATE_ANALYSIS_H_ + +#include +#include +#include + +#include "llvm/ADT/DenseMap.h" +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace TF { +namespace detail { + +// This template defines an aggregate analysis base class, which analyzes a +// module but the analysis info is stored per function. +template +class PerFunctionAggregateAnalysis { + public: + using Info = InfoT; + + // Returns the analysis info for the given function. + const Info& GetAnalysisForFunc(FuncOp func) const { + auto it = info_map_.find(func); + assert(it != info_map_.end()); + return it->second; + } + + protected: + llvm::SmallDenseMap info_map_; +}; + +} // namespace detail + +// Base CRTP class to help write passes that are consumes a per-function +// aggregate analysis and operate on all non-extern functions (similar to a +// FunctionPass, but with no concurrency between functions). The derived classes +// need to provide a runOnFunction() method that accepts the function and the +// analysis information for that function. 
+template +class PerFunctionAggregateAnalysisConsumerPass + : public PassWrapper< + PerFunctionAggregateAnalysisConsumerPass, + OperationPass> { + void runOnOperation() override { + ModuleOp op = this->getOperation(); + DerivedT& derived = *static_cast(this); + auto& analysis = this->template getAnalysis(); + + for (auto func : op.getOps()) + if (!func.isExternal()) + derived.runOnFunction(func, analysis.GetAnalysisForFunc(func)); + } +}; + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_ANALYSIS_PER_FUNCTION_AGGREGATE_ANALYSIS_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.cc b/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.cc new file mode 100644 index 00000000000..256217b6542 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.cc @@ -0,0 +1,507 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h" + +#include +#include + +#include "absl/strings/str_cat.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Block.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/tf2xla/resource_operation_table.h" +#include "tensorflow/core/framework/resource_mgr.h" + +namespace mlir { +namespace TF { +namespace detail { + +//===----------------------------------------------------------------------===// +// BacktrackAnalysisInfo +//===----------------------------------------------------------------------===// +// Class to hold backtrack analysis for a results of a region. Backtrack +// analysis will trace back the definition of return values of regions through +// pass-through operations, so that the return value of the region will have the +// same value as the backtracked value. +class BacktrackAnalysisInfo { + public: + // Initializes the backtrack analysis for the given region. 
+ explicit BacktrackAnalysisInfo(Region& region, + detail::BacktrackAnalysis& backtrack_analysis); + + BacktrackAnalysisInfo(BacktrackAnalysisInfo&&) = default; + + // Returns the value to which the given result number of the region can be + // backtracked to. + Value GetValue(int result_index) const { + return backtracked_values_[result_index]; + } + + // Returns the argument index of the region to which the given result number + // can backtracked to. Such results will be called "function passthrough". If + // the result cannot be backtracked to a region argument, returns llvm::None. + llvm::Optional GetArg(int result_index) const { + if (auto arg = GetValue(result_index).dyn_cast()) + if (arg.getParentBlock() == ®ion_->front()) return arg.getArgNumber(); + return llvm::None; + } + + private: + friend class detail::BacktrackAnalysis; + + // Region for which this object holds the analysis info. + Region* region_; + + // Backtracked values indexed by the result number. + llvm::SmallVector backtracked_values_; +}; + +//===----------------------------------------------------------------------===// +// BacktrackAnalysis +//===----------------------------------------------------------------------===// +// Holds backtrack analysis for all functions and regions within a module. +class BacktrackAnalysis { + public: + using InfoT = BacktrackAnalysisInfo; + + // Constructs the analysis by analyzing the given module. + explicit BacktrackAnalysis(ModuleOp module); + + // Returns backtracking analysis for the given region. + const InfoT& GetAnalysisForRegion(Region& region) const { + auto it = info_map_.find(®ion); + assert(it != info_map_.end()); + return it->second; + } + + // Returns backtracking analysis for the given function. + const InfoT& GetAnalysisForFunc(FuncOp func) const { + return GetAnalysisForRegion(func.getBody()); + } + + // Backtracks the given value. + Value BacktrackValue(Value value); + + private: + // Returns the analysis for the given region (analyzing the region if it has + // not yet been analyzed). + const InfoT& GetOrCreateAnalysis(Region& region) { + auto it = info_map_.find(®ion); + if (it == info_map_.end()) { + // Note: Keep object construction and insertion separate. If we use + // emplace() to construct and insert in a single shot, when analyzing + // this region, calls to BacktrackValue() may end up inserting additional + // entries in the map, causing the underlying storage to be moved. This + // would also include this pertially constructed object that we have just + // inserted into the map and are constructing it. To avoid this issue, + // construct the analysis object separately and then insert it into the + // map. + InfoT info(region, *this); + info_map_.insert({®ion, std::move(info)}); + } + + return GetAnalysisForRegion(region); + } + + private: + llvm::SmallDenseMap info_map_; +}; + +// Analyzes all regions attached to all operations in the module. +BacktrackAnalysis::BacktrackAnalysis(ModuleOp module) { + module.walk([this](Operation* op) { + for (Region& region : op->getRegions()) GetOrCreateAnalysis(region); + }); +} + +// Backtracks the definition of `value` looking through passthrough ops. +// Returns a non-null value and can return `value` if backtracking is not +// possible. 
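A short, hypothetical example of what the backtracking resolves (shown as
comments; the IR described is schematic and not taken from a test):

// Example (schematic): suppose a tf_executor.graph fetches the data result of
// an island whose yield returns the result of a "tf.Identity"(%v). Then
//   Value src = backtrack_analysis.BacktrackValue(graph_op.getResult(0));
// walks fetch -> yield -> Identity and yields %v. If the defining op is not a
// graph, an island, or an Identity-style passthrough op, the original value is
// returned as-is.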
+Value BacktrackAnalysis::BacktrackValue(Value value) { + while (Operation* op = value.getDefiningOp()) { + int res_index = value.cast().getResultNumber(); + if (auto graph = dyn_cast(op)) { + value = graph.GetFetch().getOperand(res_index); + } else if (auto island = dyn_cast(op)) { + // Control output is generated by the IslandOp, not the yield in + // in the Island body. + if (value == island.control()) break; + value = island.GetYield().getOperand(res_index); + } else if (isa(op)) { + value = op->getOperand(res_index); + } else { + break; + } + } + return value; +} + +// Analyze the region. +BacktrackAnalysisInfo::BacktrackAnalysisInfo( + Region& region, detail::BacktrackAnalysis& backtrack_analysis) + : region_(®ion) { + if (region.empty()) return; + + assert(llvm::hasSingleElement(region.getBlocks())); + auto results = region.front().getTerminator()->getOperands(); + if (results.empty()) return; + + backtracked_values_.reserve(results.size()); + for (auto result : results) + backtracked_values_.push_back(backtrack_analysis.BacktrackValue(result)); +} + +namespace { + +//===----------------------------------------------------------------------===// +// ResourceAliasAnalysisInfo helper functions. +//===----------------------------------------------------------------------===// + +constexpr char kResourceArgUniqueIdAttr[] = "tf._resource_arg_unique_id"; + +// Returns if a VarHandleOp is anonymous, which means it always creates a new +// variable. +bool IsResourceHandleAnonymous(VarHandleOp handle) { + return handle.shared_name() == tensorflow::ResourceHandle::ANONYMOUS_NAME; +} + +// Returns a string unique identifier for a non-anonymous VarHandleOp. +std::string GetVarHandleStringId(VarHandleOp handle) { + auto device = handle.getAttrOfType("device"); + return absl::StrCat(handle.container().str(), "/", handle.shared_name().str(), + "/", device ? device.getValue().str() : std::string("")); +} + +// Finds a unique ID for a VarHandleOp's output. If it is anonymous, always +// creates a new ID; otherwise, tries to reuse the existing ID for the +// referenced variable if it exists, or creates a new one if not. +int64_t GetOrCreateIdForVarHandle(VarHandleOp handle, int64_t* next_id, + llvm::StringMap* name_id_map) { + // Always create a new ID for anonymous handle. + if (IsResourceHandleAnonymous(handle)) return (*next_id)++; + + auto name = GetVarHandleStringId(handle); + auto emplace_res = name_id_map->try_emplace(name, *next_id); + // New ID created, increment next_id. + if (emplace_res.second) ++(*next_id); + return emplace_res.first->second; +} + +} // namespace + +//===----------------------------------------------------------------------===// +// ResourceAliasAnalysisInfo +//===----------------------------------------------------------------------===// + +// Constructs the analysis info by analyzing the given function. +ResourceAliasAnalysisInfo::ResourceAliasAnalysisInfo( + FuncOp func_op, const BacktrackAnalysis& backtrack_analysis) { + // This function populates resource_value_to_ids_ and id_to_resource_values_. + + int64_t next_unique_id = 0; + + // Helper to assign new unique id for all resources in the given list of + // values. + auto assign_unique_id_to_all = [&](ValueRange values) { + for (Value value : filter_resources(values)) { + AddValueUniqueIDMapping(value, next_unique_id++); + } + }; + + // Helper to assign new unknown id for all resources in the given list of + // values. 
+ auto assign_unknown_id_to_all = [&](ValueRange values) { + for (Value value : filter_resources(values)) { + AddValueUniqueIDMapping(value, kUnknownResourceId); + } + }; + + // If the "tf.resource_arg_unique_id" argument attributes are present for + // resource-type arguments, respect them when choosing IDs; otherwise, they + // must not alias. + const bool has_arg_unique_id_attrs = + llvm::any_of(func_op.getArguments(), [&](const BlockArgument& arg) { + return func_op.getArgAttr(arg.getArgNumber(), kResourceArgUniqueIdAttr); + }); + // Maps the kResourceArgUniqueIdAttr attribute value to the internal integer + // ID used by this pass. + if (has_arg_unique_id_attrs) { + llvm::SmallDenseMap attr_id_to_internal_id; + for (auto arg : filter_resources(func_op.getArguments())) { + auto id_attr = func_op.getArgAttrOfType( + arg.getArgNumber(), kResourceArgUniqueIdAttr); + assert(id_attr && + "tf.resource_arg_unique_id attribute should exist on either " + "none or all arguments."); + auto emplace_res = attr_id_to_internal_id.try_emplace(id_attr.getInt(), + next_unique_id++); + AddValueUniqueIDMapping(arg, emplace_res.first->getSecond()); + } + } else { + assign_unique_id_to_all(func_op.getArguments()); + } + + // Since this analysis is neither inter-procedural nor inter-regional, + // each region attached to Op's within a function is analyzed independently. + // Seed this analysis for each such region by mapping all resource arguments + // for such regions to a new unique-id. This is required because walk() walks + // the attached regions first before visiting the op, so there is no + // opportunity during the walk to seed region arguments. Also note that walk + // eventually also visits the Op on which the walk() is called, so make sure + // we do not overwrite the function argument mapping here. + func_op.walk([&](Operation* op) { + if (op == func_op) return; + for (Region& region : op->getRegions()) { + assign_unique_id_to_all(region.getArguments()); + } + }); + + llvm::StringMap var_handle_name_id_map; + func_op.walk([&](Operation* op) { + if (auto var_handle = dyn_cast(op)) { + AddValueUniqueIDMapping( + var_handle.resource(), + GetOrCreateIdForVarHandle(var_handle, &next_unique_id, + &var_handle_name_id_map)); + } else if (llvm::isa(op)) { + for (auto result : filter_resources(op->getResults())) + PropagateInputToOutput(op->getOperand(result.getResultNumber()), + result); + } else if (auto while_op = dyn_cast(op)) { + AnalyzeWhileLoop(while_op, backtrack_analysis.GetAnalysisForFunc( + while_op.body_func())); + } else if (auto while_region = dyn_cast(op)) { + AnalyzeWhileLoop(while_region, backtrack_analysis.GetAnalysisForRegion( + while_region.body())); + } else if (auto if_op = dyn_cast(op)) { + const auto& then_info = + backtrack_analysis.GetAnalysisForFunc(if_op.then_func()); + const auto& else_info = + backtrack_analysis.GetAnalysisForFunc(if_op.else_func()); + // If a result is a passthrough of both branches' inputs, merge the + // resource IDs of corresponding operands for the two inputs. 
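A concrete (hypothetical) case of the merge performed in the loop below, with
invented symbol names and result types omitted:

// Given   %r = "tf.If"(%cond, %v0, %v1) { then_branch = @t, else_branch = @e }
// where @t returns its resource argument #0 and @e returns its resource
// argument #1, the result %r receives the union of the IDs assigned to %v0 and
// %v1, i.e. %r may alias either input. If either branch result cannot be
// backtracked to a function argument, %r is mapped to kUnknownResourceId.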
+ for (auto result : filter_resources(if_op.getResults())) { + auto passthrough_then_arg = then_info.GetArg(result.getResultNumber()); + auto passthrough_else_arg = else_info.GetArg(result.getResultNumber()); + if (passthrough_then_arg && passthrough_else_arg) { + Value then_operand = if_op.input()[passthrough_then_arg.getValue()]; + Value else_operand = if_op.input()[passthrough_else_arg.getValue()]; + PropagateInputToOutput(then_operand, result); + PropagateInputToOutput(else_operand, result); + } else { + AddValueUniqueIDMapping(result, kUnknownResourceId); + } + } + } else if (auto if_region = dyn_cast(op)) { + const auto& then_info = + backtrack_analysis.GetAnalysisForRegion(if_region.then_branch()); + const auto& else_info = + backtrack_analysis.GetAnalysisForRegion(if_region.else_branch()); + for (auto result : filter_resources(if_region.getResults())) { + Value then_result = then_info.GetValue(result.getResultNumber()); + Value else_result = else_info.GetValue(result.getResultNumber()); + // For IfRegion, the walk would have visited the else and then regions + // before visiting the IfRegion op. Backtracking of the then and else + // results will either give a value computed within these regions, + // or a region capture. If its a region capture, computed before this + // IfRegion, it will have been visited earlier and a mapping would + // exist for that value. If its computed within the region, then again + // a mapping would exist. + PropagateInputToOutput(then_result, result); + PropagateInputToOutput(else_result, result); + } + } else if (auto call = dyn_cast(op)) { + FuncOp func = dyn_cast(call.resolveCallable()); + if (!func) { + assign_unknown_id_to_all(op->getResults()); + return WalkResult::advance(); + } + const auto& func_info = backtrack_analysis.GetAnalysisForFunc(func); + for (auto result : filter_resources(op->getResults())) { + auto passthrough_arg = func_info.GetArg(result.getResultNumber()); + if (passthrough_arg) { + PropagateInputToOutput( + call.getArgOperands()[passthrough_arg.getValue()], result); + } else { + AddValueUniqueIDMapping(result, kUnknownResourceId); + } + } + } else { + assign_unknown_id_to_all(op->getResults()); + } + return WalkResult::advance(); + }); +} + +// Propagates the resource ID's from an input operand to a result. Returns true +// if the mapping changed. +bool ResourceAliasAnalysisInfo::PropagateInputToOutput(const Value& operand, + const OpResult& result) { + auto operand_it = resource_value_to_ids_.find(operand); + assert(operand_it != resource_value_to_ids_.end() && + "A resource-type output does not have the corresponding " + "resource-type input."); + bool change = false; + for (int64_t id : operand_it->second) + change = AddValueUniqueIDMapping(result, id) || change; + return change; +} + +// Analyzes while loops to compute resourceIDs for the loop results. +// +// (1) The base case for the analysis is that if the loop body does not execute +// at all, the resource IDs for each result is the same as the resource IDs +// of the corresponding input. +// (2) If the loop does execute one or more times, then we need to account for +// data flow through the body of the while loop. If result #r is the same +// as arg #a of the loop body (pass through argument), then we can reason +// further, else if the result is not a passthrough, we mark it as unknown. 
+// (3) For passthrough results, if result #r is the same as arg #a of the loop +// body, after one iteration, result #r = arg #a, so we need to also +// propagate arg #a to result #r. After another iteration, arg #a of the +// loop body will be result #a of the previous iteration. So then we need +// propagate from result #a to result #r. Generalizing, the resource ID +// propagation (for results which are passthrough) looks like: +// +// for r in (0, num_results) : result[r] = arg[r]; +// repeat till no change { +// a = passthrough arg for result #r; +// result[r] += result[a]; +// } +// +void ResourceAliasAnalysisInfo::AnalyzeWhileLoop( + Operation* while_op, const BacktrackAnalysisInfo& body_info) { + // Seed the resource ID's for the results using either the resource ID of the + // passthrough arg, or unknown. We need to perform further analysis if we + // find a passthrough arg which is not the same as corresponding the result #. + llvm::SmallVector, 4> passthrough_args( + while_op->getNumResults()); + bool need_analysis = false; + for (auto result : filter_resources(while_op->getResults())) { + int result_index = result.getResultNumber(); + passthrough_args[result_index] = body_info.GetArg(result_index); + if (passthrough_args[result_index]) { + int passthru_index = passthrough_args[result_index].getValue(); + PropagateInputToOutput(while_op->getOperand(passthru_index), result); + need_analysis |= + !IsUnknownResource(result) && passthru_index != result_index; + } else { + AddValueUniqueIDMapping(result, kUnknownResourceId); + } + } + + if (!need_analysis) return; + + // We found a result that is not unknown and whose passthrough operand index + // is not the same as the result index, which means there is "crosstalk" + // between 2 or more operands. In that case, we do an iterative propagation + // of resource ID's till the results converge. + bool change = true; + while (change) { + change = false; + for (auto result : filter_resources(while_op->getResults())) { + if (IsUnknownResource(result)) continue; + // If this result has a valid passthrough arg, propagate resource ID's + // from the result of the passthrough arg + int result_index = result.getResultNumber(); + int passthru_index = passthrough_args[result_index].getValue(); + change = + PropagateInputToOutput(while_op->getResult(passthru_index), result) || + change; + } + } +} + +bool ResourceAliasAnalysisInfo::IsUnknownResource(Value resource) const { + auto it = resource_value_to_ids_.find(resource); + assert(it != resource_value_to_ids_.end() && !it->getSecond().empty()); + // The set is sorted so we only need to check the first element since + // kUnknownResourceId < 0. 
+ static_assert(kUnknownResourceId < 0, + "kUnknownResourceId should be negative"); + return *it->getSecond().begin() == kUnknownResourceId; +} + +const llvm::SmallSet& +ResourceAliasAnalysisInfo::GetResourceUniqueIds(Value resource) const { + assert(!IsUnknownResource(resource)); + auto it = resource_value_to_ids_.find(resource); + assert(it != resource_value_to_ids_.end() && "Unseen resource was queried"); + return it->getSecond(); +} + +const llvm::SmallSetVector& +ResourceAliasAnalysisInfo::GetUniqueIdResources(const int64_t id) const { + auto it = id_to_resource_values_.find(id); + assert(it != id_to_resource_values_.end() && "Unseen id was queried"); + return it->getSecond(); +} + +llvm::SmallSetVector ResourceAliasAnalysisInfo::GetResourceAliases( + Value resource) const { + assert(!IsUnknownResource(resource) && "Unknown resource was queried"); + llvm::SmallSetVector aliases; + for (int64_t id : GetResourceUniqueIds(resource)) { + const llvm::SmallSetVector& resources_aliasing_id = + GetUniqueIdResources(id); + aliases.insert(resources_aliasing_id.begin(), resources_aliasing_id.end()); + } + // If there are resources that were marked as unknown, they alias with all + // other resources. + auto it = id_to_resource_values_.find(kUnknownResourceId); + if (it != id_to_resource_values_.end()) + aliases.insert(it->getSecond().begin(), it->getSecond().end()); + return aliases; +} + +} // namespace detail + +//===----------------------------------------------------------------------===// +// ResourceAliasAnalysis +//===----------------------------------------------------------------------===// + +ResourceAliasAnalysis::ResourceAliasAnalysis(Operation* op) { + auto module = dyn_cast(op); + assert(module); + + // Analyze all regions for backtracking info. + detail::BacktrackAnalysis backtrack_analysis(module); + + // Analyze each function. + for (auto func : module.getOps()) + this->info_map_.try_emplace(func, func, backtrack_analysis); +} + +} // namespace TF +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h b/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h new file mode 100644 index 00000000000..c965b5d7602 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h @@ -0,0 +1,120 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_ANALYSIS_RESOURCE_ALIAS_ANALYSIS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_ANALYSIS_RESOURCE_ALIAS_ANALYSIS_H_ + +#include +#include +#include + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringMap.h" +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Region.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/analysis/per_function_aggregate_analysis.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" + +namespace mlir { +namespace TF { +namespace detail { +class BacktrackAnalysis; +class BacktrackAnalysisInfo; + +// Resource alias analysis information for a single function. +class ResourceAliasAnalysisInfo { + public: + // Constructs analysis info by analyzing the given function. + ResourceAliasAnalysisInfo(FuncOp func, + const BacktrackAnalysis& backtrack_analysis); + + ResourceAliasAnalysisInfo(ResourceAliasAnalysisInfo&&) = default; + + // Returns if the analysis fails to resolve a resource-type value. + bool IsUnknownResource(Value resource) const; + + // Returns the set unique IDs which `resource` could alias. Requires that + // IsUnknownResource(resource) == false. + const llvm::SmallSet& GetResourceUniqueIds(Value resource) const; + + // Returns the set of values that are potentially aliases of `value`. Requires + // that IsUnknownResource(resource) == false. + llvm::SmallSetVector GetResourceAliases(Value resource) const; + + private: + // Maps resource value to unique ID and vice-versa. Returns true of the + // mapping has changed. + bool AddValueUniqueIDMapping(Value value, int64_t id) { + resource_value_to_ids_[value].insert(id); + return id_to_resource_values_[id].insert(value); + } + + // Returns the set unique Values which map to `id`. + const llvm::SmallSetVector& GetUniqueIdResources(int64_t id) const; + + // Propagates the resource ID's from an input operand to a result. Returns + // true of the mapping has changed. + bool PropagateInputToOutput(const Value& operand, const OpResult& result); + + // Analyzes while loops to compute resourceID's for the loop results. + // `body_info` is the backtrack analysis info for the loop body. + void AnalyzeWhileLoop(Operation* while_op, + const BacktrackAnalysisInfo& body_info); + + // Maps each resource-type value to a set of unique IDs that it could alias. + llvm::SmallDenseMap, 8> + resource_value_to_ids_; + + // Maps each unique ID to a set of resource-type values that could alias to + // it. This is inverse of `resource_value_to_ids_` map. + llvm::SmallDenseMap, 8> + id_to_resource_values_; + + public: + static constexpr int64_t kUnknownResourceId = -1; +}; + +} // namespace detail + +// An analysis that runs on a module and maps each resource-type value to a +// set of unique IDs representing the possible resources it could alias. +// +// Note that this is not an inter-procedural or inter-regional analysis, i.e., +// each function and region are handled separately and cross-function or cross- +// region aliasing cannot be checked by this analysis. +class ResourceAliasAnalysis : public detail::PerFunctionAggregateAnalysis< + detail::ResourceAliasAnalysisInfo> { + public: + // Constructs analysis by analyzing the given module operation. 
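A hedged usage sketch may help orient readers (the function name is invented,
`using namespace mlir` is assumed, and filter_resources is the helper declared
further below in this header):

// Hypothetical: build the analysis once per module, then query per function.
void ExampleAliasQuery(ModuleOp module) {
  TF::ResourceAliasAnalysis analysis(module);
  for (FuncOp func : module.getOps<FuncOp>()) {
    const auto& info = analysis.GetAnalysisForFunc(func);
    for (Value resource : TF::filter_resources(func.getArguments())) {
      if (info.IsUnknownResource(resource)) continue;
      // Values that may refer to the same underlying resource as `resource`.
      auto aliases = info.GetResourceAliases(resource);
      (void)aliases;
    }
  }
}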
+ explicit ResourceAliasAnalysis(Operation* op); +}; + +// Returns a range with just resource type values from the input range +// preserved. +template +auto filter_resources(RangeT&& range) { + return llvm::make_filter_range(std::forward(range), [](Value val) { + return getElementTypeOrSelf(val.getType()).isa(); + }); +} + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_ANALYSIS_RESOURCE_ALIAS_ANALYSIS_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc index be203e0397e..9e78b90debc 100644 --- a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc +++ b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc @@ -45,234 +45,14 @@ limitations under the License. namespace mlir { namespace TF { - namespace { -constexpr int64_t kUnknownResourceId = -1; -constexpr char kResourceArgUniqueIdAttr[] = "tf._resource_arg_unique_id"; +constexpr auto kUnknownResourceId = + ResourceAliasAnalysis::Info::kUnknownResourceId; -// Returns if a VarHandleOp is anonymous, which means it always creates a new -// variable. -bool IsResourceHandleAnonymous(TF::VarHandleOp handle) { - return handle.shared_name() == tensorflow::ResourceHandle::ANONYMOUS_NAME; -} - -// Returns a string unique identifier for a non-anonymous VarHandleOp. -std::string GetVarHandleStringId(TF::VarHandleOp handle) { - auto device = handle.getAttrOfType("device"); - return absl::StrCat(handle.container().str(), "/", handle.shared_name().str(), - "/", device ? device.getValue().str() : std::string("")); -} - -// Finds a unique ID for a VarHandleOp's output. If it is anonymous, always -// creates a new ID; otherwise, tries to reuse the existing ID for the -// referenced variable if it exists, or creates a new one if not. -int64_t GetOrCreateIdForVarHandle(TF::VarHandleOp handle, int64_t* next_id, - llvm::StringMap* name_id_map) { - // Always create a new ID for anonymous handle. - if (IsResourceHandleAnonymous(handle)) return (*next_id)++; - - auto name = GetVarHandleStringId(handle); - auto emplace_res = name_id_map->try_emplace(name, *next_id); - // New ID created, increment next_id. - if (emplace_res.second) ++(*next_id); - return emplace_res.first->second; -} - -// If the return value for `func_op` at `return_index` is a pass-through of an -// argument of this function, returns the argument index; otherwise, returns -1. 
-int64_t FindPassthroughArgumentForReturnValue(int64_t return_index, - FuncOp func_op) { - auto value = - func_op.getBody().front().getTerminator()->getOperand(return_index); - assert(mlir::getElementTypeOrSelf(value.getType()).isa()); - int64_t arg_index = -1; - auto try_parse_arg_index = [&arg_index](Value v) { - auto resource_arg = v.dyn_cast(); - if (resource_arg) arg_index = resource_arg.getArgNumber(); - return arg_index; - }; - while (try_parse_arg_index(value) == -1) { - auto op = value.getDefiningOp(); - assert(op); - int64_t res_num = value.cast().getResultNumber(); - if (auto graph = llvm::dyn_cast(op)) { - value = graph.GetFetch().getOperand(res_num); - } else if (auto island = llvm::dyn_cast(op)) { - value = island.GetYield().getOperand(res_num); - } else if (llvm::isa(op)) { - value = op->getOperand(res_num); - } else { - return -1; - } - } - return arg_index; -} - -} // namespace - -ResourceAliasAnalysis::ResourceAliasAnalysis(Operation* op) { - auto func_op = llvm::dyn_cast(op); - if (!func_op) return; - AnalyzeFunction(func_op); -} - -void ResourceAliasAnalysis::AnalyzeFunction(FuncOp func_op) { - // This function populates resource_value_to_ids_ and id_to_resource_values_. - - // If the "tf.resource_arg_unique_id" argument attributes are present for - // resource-type arguments, respect them when choosing IDs; otherwise, they - // must not alias. - int64_t next_unique_id = 0; - const bool has_arg_unique_id_attrs = - llvm::any_of(func_op.getArguments(), [&](const BlockArgument& arg) { - return func_op.getArgAttr(arg.getArgNumber(), kResourceArgUniqueIdAttr); - }); - // Maps the kResourceArgUniqueIdAttr attribute value to the internal integer - // ID used by this pass. - llvm::SmallDenseMap attr_id_to_internal_id; - for (auto arg : func_op.getArguments()) { - if (!mlir::getElementTypeOrSelf(arg.getType()).isa()) - continue; - if (has_arg_unique_id_attrs) { - auto id_attr = func_op.getArgAttrOfType( - arg.getArgNumber(), kResourceArgUniqueIdAttr); - assert(id_attr && - "tf.resource_arg_unique_id attribute should exist on either none " - "or all arguments."); - auto emplace_res = attr_id_to_internal_id.try_emplace(id_attr.getInt(), - next_unique_id++); - AddValueUniqueIDMapping(arg, emplace_res.first->getSecond()); - } else { - AddValueUniqueIDMapping(arg, next_unique_id++); - } - } - llvm::StringMap var_handle_name_id_map; - auto forward_input_to_output = [&](const Value& operand, - const Value& result) { - if (!mlir::getElementTypeOrSelf(result.getType()).isa()) - return; - auto& result_ids = resource_value_to_ids_[result]; - auto operand_it = resource_value_to_ids_.find(operand); - assert(operand_it != resource_value_to_ids_.end() && - "A resource-type output does not have the corresponding " - "resource-type input."); - result_ids.insert(operand_it->getSecond().begin(), - operand_it->getSecond().end()); - }; - auto module = func_op.getParentOfType(); - - func_op.walk([&](Operation* op) { - if (auto var_handle = llvm::dyn_cast(op)) { - AddValueUniqueIDMapping( - var_handle.resource(), - GetOrCreateIdForVarHandle(var_handle, &next_unique_id, - &var_handle_name_id_map)); - } else if (llvm::isa(op)) { - for (auto operand_and_result : - llvm::zip(op->getOperands(), op->getResults())) { - forward_input_to_output(std::get<0>(operand_and_result), - std::get<1>(operand_and_result)); - } - } else if (auto replicate = llvm::dyn_cast(op)) { - // The nested block for ReplicateOp is handled separately in side-effect - // analysis. 
Inside that block, we can still treat its block arguments as - // different resources. - for (auto arg : replicate.GetBody().getArguments()) { - if (mlir::getElementTypeOrSelf(arg.getType()).isa()) { - AddValueUniqueIDMapping(arg, next_unique_id++); - } - } - } else if (auto while_op = llvm::dyn_cast(op)) { - auto body = llvm::cast(module.lookupSymbol(while_op.body())); - // If a result is a passthrough of the body input, use the corresponding - // operand's resource IDs. - for (auto result : llvm::enumerate(while_op.getResults())) { - if (!mlir::getElementTypeOrSelf(result.value().getType()) - .isa()) { - continue; - } - int64_t passthrough_operand = - FindPassthroughArgumentForReturnValue(result.index(), body); - if (passthrough_operand >= 0) { - forward_input_to_output(while_op.getOperand(passthrough_operand), - result.value()); - } else { - AddValueUniqueIDMapping(result.value(), kUnknownResourceId); - } - } - } else if (auto if_op = llvm::dyn_cast(op)) { - auto then_branch = - llvm::cast(module.lookupSymbol(if_op.then_branch())); - auto else_branch = - llvm::cast(module.lookupSymbol(if_op.else_branch())); - // If a result is a passthrough of both branches' inputs, merge the - // resource IDs of corresponding operands for the two inputs. - for (auto result : llvm::enumerate(if_op.getResults())) { - if (!mlir::getElementTypeOrSelf(result.value().getType()) - .isa()) { - continue; - } - int64_t passthrough_then_arg = - FindPassthroughArgumentForReturnValue(result.index(), then_branch); - int64_t passthrough_else_arg = - FindPassthroughArgumentForReturnValue(result.index(), else_branch); - if (passthrough_then_arg >= 0 && passthrough_else_arg >= 0) { - forward_input_to_output(if_op.getOperand(passthrough_then_arg + 1), - result.value()); - forward_input_to_output(if_op.getOperand(passthrough_else_arg + 1), - result.value()); - } else { - AddValueUniqueIDMapping(result.value(), kUnknownResourceId); - } - } - } else { - for (auto result : op->getResults()) { - if (!mlir::getElementTypeOrSelf(result.getType()) - .isa()) - continue; - AddValueUniqueIDMapping(result, kUnknownResourceId); - } - } - }); -} - -bool ResourceAliasAnalysis::IsUnknownResource(const Value resource) const { - auto it = resource_value_to_ids_.find(resource); - assert(it != resource_value_to_ids_.end() && !it->getSecond().empty()); - // The set is sorted so we only need to check the first element since - // kUnknownResourceId < 0. 
- static_assert(kUnknownResourceId < 0, - "kUnknownResourceId should be negative"); - return *it->getSecond().begin() == kUnknownResourceId; -} - -const llvm::SmallSet& ResourceAliasAnalysis::GetResourceUniqueIds( - const Value resource) const { - auto it = resource_value_to_ids_.find(resource); - assert(it != resource_value_to_ids_.end() && "Unseen resource was queried"); - return it->getSecond(); -} - -const llvm::SmallSetVector& -ResourceAliasAnalysis::GetUniqueIdResources(const int64_t id) const { - auto it = id_to_resource_values_.find(id); - assert(it != id_to_resource_values_.end() && "Unseen id was queried"); - return it->getSecond(); -} - -llvm::SmallSetVector ResourceAliasAnalysis::GetResourceAliases( - const Value resource) const { - assert(!IsUnknownResource(resource) && "Unseen resource was queried"); - llvm::SmallSetVector aliases; - for (int64_t id : GetResourceUniqueIds(resource)) { - const llvm::SmallSetVector& resources_aliasing_id = - GetUniqueIdResources(id); - aliases.insert(resources_aliasing_id.begin(), resources_aliasing_id.end()); - } - return aliases; -} -namespace { +//===----------------------------------------------------------------------===// +// SideEffectAnalysisInfo helper functions. +//===----------------------------------------------------------------------===// // Returns a set that contains only kUnknownResourceId. llvm::SmallDenseSet UnknownResourceSet() { @@ -284,7 +64,7 @@ llvm::SmallDenseSet UnknownResourceSet() { // Returns all resources that could be accessed by op, or UnknownResourceSet() // if we cannot find all of them. llvm::SmallDenseSet FindAccessedResources( - Operation* op, const ResourceAliasAnalysis& alias_analysis) { + Operation* op, const ResourceAliasAnalysis::Info& alias_analysis) { llvm::SmallDenseSet resources; for (auto operand : op->getOperands()) { @@ -311,7 +91,6 @@ llvm::SmallDenseSet FindAccessedResources( // TODO(yuanzx): Define this information in a different place. Currently we use // tensorflow/compiler/tf2xla/resource_operation_table.h. const tensorflow::XlaResourceOpInfo* GetResourceInfoForOp(Operation* op) { - auto op_name = op->getName().getStringRef().str(); if (op->getName().getDialect() != TF::TensorFlowDialect::getDialectNamespace()) { return nullptr; @@ -329,7 +108,7 @@ bool OpIsReadOnly(Operation* op) { // Returns if `op` is a resource declaration. bool OpIsDeclaration(Operation* op, - const ResourceAliasAnalysis& alias_analysis) { + const ResourceAliasAnalysis::Info& alias_analysis) { // TODO(yuanzx): Add other types of resources. return llvm::isa(op) || (llvm::isa(op) && @@ -370,8 +149,13 @@ bool OpIsKnownToHaveNoSideEffect(Operation* op) { } // namespace -void SideEffectAnalysis::TrackAccess(int64_t resource_id, Operation* op, - bool read_only) { +namespace detail { +//===----------------------------------------------------------------------===// +// SideEffectAnalysisInfo +//===----------------------------------------------------------------------===// + +void SideEffectAnalysisInfo::TrackAccess(int64_t resource_id, Operation* op, + bool read_only) { if (resource_id == kUnknownResourceId) { if (read_only) { // New unknown read is not tracked by any known resource access. 
@@ -402,9 +186,9 @@ void SideEffectAnalysis::TrackAccess(int64_t resource_id, Operation* op, } } -void SideEffectAnalysis::AddPredecessorsForAccess(int64_t resource_id, - Operation* op, - bool read_only) { +void SideEffectAnalysisInfo::AddPredecessorsForAccess(int64_t resource_id, + Operation* op, + bool read_only) { auto it = per_resource_access_info_.find(resource_id); if (it == per_resource_access_info_.end()) return; const auto& access_info = it->getSecond(); @@ -420,8 +204,8 @@ void SideEffectAnalysis::AddPredecessorsForAccess(int64_t resource_id, } } -void SideEffectAnalysis::AnalyzeFunction( - FuncOp func_op, const ResourceAliasAnalysis& alias_analysis) { +void SideEffectAnalysisInfo::AnalyzeFunction( + FuncOp func_op, const TF::ResourceAliasAnalysis::Info& alias_analysis) { // AnalyzeRegion() recursively analyzes the function body, and only populates // control_predecessors_. AnalyzeRegion(&func_op.getBody(), alias_analysis); @@ -448,8 +232,8 @@ void SideEffectAnalysis::AnalyzeFunction( } } -void SideEffectAnalysis::AnalyzeRegion( - Region* region, const ResourceAliasAnalysis& alias_analysis) { +void SideEffectAnalysisInfo::AnalyzeRegion( + Region* region, const TF::ResourceAliasAnalysis::Info& alias_analysis) { // This function populates control_predecessors_ by walking through the // region, and tracking resource accesses in per_resource_access_info_. @@ -476,13 +260,12 @@ void SideEffectAnalysis::AnalyzeRegion( // different nested regions separately. for (auto& block : *region) { for (auto& op : block) { - if (op.getNumRegions() > 0) { - llvm::SmallVector child_analyses; - for (auto& child_region : op.getRegions()) { - child_analyses.emplace_back(); - child_analyses.back().AnalyzeRegion(&child_region, alias_analysis); - } - ConsumeChildAnalyses(std::move(child_analyses)); + for (Region& child : op.getRegions()) { + SideEffectAnalysisInfo child_analysis(&child, alias_analysis); + // Moves the control_predecessors_ fields in child region to current + // region + for (auto& entry : child_analysis.control_predecessors_) + control_predecessors_[entry.first] = std::move(entry.second); } // We do not need explicit control edges for declaration ops. 
@@ -529,16 +312,8 @@ void SideEffectAnalysis::AnalyzeRegion( } } -void SideEffectAnalysis::ConsumeChildAnalyses( - llvm::SmallVector&& children) { - for (auto& child : children) { - for (auto& entry : child.control_predecessors_) { - control_predecessors_[entry.getFirst()] = std::move(entry.getSecond()); - } - } -} - -llvm::SmallVector SideEffectAnalysis::DirectControlPredecessors( +llvm::SmallVector +SideEffectAnalysisInfo::DirectControlPredecessors( Operation* op, llvm::function_ref filter) const { llvm::SmallVector result; auto it = sorted_control_predecessors_.find(op); @@ -550,7 +325,8 @@ llvm::SmallVector SideEffectAnalysis::DirectControlPredecessors( return result; } -llvm::SmallVector SideEffectAnalysis::DirectControlSuccessors( +llvm::SmallVector +SideEffectAnalysisInfo::DirectControlSuccessors( Operation* op, llvm::function_ref filter) const { llvm::SmallVector result; auto it = sorted_control_successors_.find(op); @@ -561,12 +337,19 @@ llvm::SmallVector SideEffectAnalysis::DirectControlSuccessors( } return result; } +} // namespace detail SideEffectAnalysis::SideEffectAnalysis(Operation* op) { - auto func_op = llvm::dyn_cast(op); - if (!func_op) return; - ResourceAliasAnalysis alias_analysis(op); - AnalyzeFunction(func_op, alias_analysis); + auto module = dyn_cast(op); + assert(module); + + // Analyze entire module for alias analysis info. + ResourceAliasAnalysis alias_analysis(module); + + // Analyze all functions. + for (auto func : module.getOps()) + this->info_map_.try_emplace(func, func, + alias_analysis.GetAnalysisForFunc(func)); } } // namespace TF diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h index a318c6667c6..c92c6e1882c 100644 --- a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h +++ b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_ANALYSIS_SIDE_EFFECT_ANALYSIS_H_ #define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_ANALYSIS_SIDE_EFFECT_ANALYSIS_H_ +#include #include #include @@ -23,78 +24,33 @@ limitations under the License. #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" -#include "mlir/IR/Function.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/Region.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h" namespace mlir { namespace TF { +namespace detail { -// An analysis that runs on a function and maps each resource-type value to a -// set of unique int64_t IDs representing the possible resources it could alias. -// -// If there are nested regions, each region is handled separately. This means -// cross-region aliasing cannot be checked by this analysis. -class ResourceAliasAnalysis { +// Side effect analysis info for a single function. +class SideEffectAnalysisInfo { public: - explicit ResourceAliasAnalysis(Operation* op); - ~ResourceAliasAnalysis() = default; - ResourceAliasAnalysis(ResourceAliasAnalysis&&) = default; + SideEffectAnalysisInfo() = default; - // Returns if the analysis fails to resolve a resource-type value. - bool IsUnknownResource(const Value resource) const; - - // Returns the set unique IDs which `resource` could alias. Requires that - // IsUnknownResource(resource) == true. 
- const llvm::SmallSet& GetResourceUniqueIds( - const Value resource) const; - - // Returns the set of values that are potentially aliases of `value`. Requires - // that IsUnknownResource(resource) == true. - llvm::SmallSetVector GetResourceAliases(const Value resource) const; - - private: - ResourceAliasAnalysis() = default; - - // Runs the analysis on `func_op` and populates two way resource values to - // unique ID mapping. - void AnalyzeFunction(FuncOp func_op); - - // Maps resource value to unique ID and vice-versa. - void AddValueUniqueIDMapping(Value value, int64_t id) { - resource_value_to_ids_[value].insert(id); - id_to_resource_values_[id].insert(value); + // Constructs analysis info by analyzing the given function. + SideEffectAnalysisInfo( + FuncOp func_op, const TF::ResourceAliasAnalysis::Info& alias_analysis) { + AnalyzeFunction(func_op, alias_analysis); } - // Returns the set unique Values which map to `id`. - const llvm::SmallSetVector& GetUniqueIdResources(int64_t id) const; + // Constructs analysis info by analyzing the given region. + SideEffectAnalysisInfo( + Region* region, const TF::ResourceAliasAnalysis::Info& alias_analysis) { + AnalyzeRegion(region, alias_analysis); + } - // Maps each resource-type value to a set of unique IDs that it could alias. - llvm::SmallDenseMap, 8> - resource_value_to_ids_; - - // Maps each unique ID to a set of resource-type values that could alias to - // it. This is inverse of `resource_value_to_ids_` map. - llvm::SmallDenseMap, 8> - id_to_resource_values_; -}; - -// An analysis that runs on a function and infers the control predecessors and -// successors for each op, based on side-effects on known and unknown resources. -// Side-effecting ops on unknown resources are conservatively treated as -// interfering with all known resource op accesses. It distinguishes accesses -// based on whether they are read-only, and read-only ops do not interfere with -// each other. -// -// If there are nested regions, each region is handled separately, and control -// dependencies are only tracked for ops under the same parent op. -class SideEffectAnalysis { - public: - explicit SideEffectAnalysis() = default; - explicit SideEffectAnalysis(Operation* op); - SideEffectAnalysis(SideEffectAnalysis&& other) = default; - ~SideEffectAnalysis() = default; + SideEffectAnalysisInfo(SideEffectAnalysisInfo&&) = default; // Returns a vector of ops that are direct control predecessors of `op`, // sorted in program order. If `filter` is provided, only predecessors that @@ -103,9 +59,9 @@ class SideEffectAnalysis { Operation* op, llvm::function_ref filter = nullptr) const; - // Returns a vector of ops that are direct control successors of `op`, sorted - // in program order. If `filter` is provided, only successors that pass the - // filter (returning true) will be included. + // Returns a vector of ops that are direct control successors of `op`, + // sorted in program order. If `filter` is provided, only successors that + // pass the filter (returning true) will be included. llvm::SmallVector DirectControlSuccessors( Operation* op, llvm::function_ref filter = nullptr) const; @@ -114,16 +70,11 @@ class SideEffectAnalysis { // Runs the analysis on `func_op` and populates sorted_control_predecessors_ // and sorted_control_successors_. void AnalyzeFunction(FuncOp func_op, - const ResourceAliasAnalysis& alias_analysis); + const TF::ResourceAliasAnalysis::Info& alias_analysis); // Runs the analysis on `region` and populates control_predecessors_. 
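A hedged sketch of how a transform might consume the refactored, module-level
side effect analysis (function name invented, `using namespace mlir` assumed):

// Hypothetical consumer: query per-function control dependencies.
void ExampleSideEffectQuery(ModuleOp module) {
  TF::SideEffectAnalysis analysis(module);
  for (FuncOp func : module.getOps<FuncOp>()) {
    const auto& info = analysis.GetAnalysisForFunc(func);
    func.walk([&](Operation* op) {
      // Ops that must stay ordered before `op` to preserve side effects.
      for (Operation* pred : info.DirectControlPredecessors(op)) (void)pred;
    });
  }
}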
void AnalyzeRegion(Region* region, - const ResourceAliasAnalysis& alias_analysis); - - // Moves the control_predecessors_ fields in `children` analyses to this - // current analysis. - void ConsumeChildAnalyses( - llvm::SmallVector&& children); + const TF::ResourceAliasAnalysis::Info& alias_analysis); // Updates control_predecessors_ for `op` that is being visited, on the given // `resource_id`. @@ -159,10 +110,29 @@ class SideEffectAnalysis { // write for a the current write being analyzed. bool tracked_last_unknown_write_for_write = false; }; + llvm::SmallDenseMap per_resource_access_info_; }; +} // namespace detail + +// An analysis that runs on a function and infers the control predecessors and +// successors for each op, based on side-effects on known and unknown resources. +// Side-effecting ops on unknown resources are conservatively treated as +// interfering with all known resource op accesses. It distinguishes accesses +// based on whether they are read-only, and read-only ops do not interfere with +// each other. +// +// If there are nested regions, each region is handled separately, and control +// dependencies are only tracked for ops under the same parent op. +class SideEffectAnalysis : public detail::PerFunctionAggregateAnalysis< + detail::SideEffectAnalysisInfo> { + public: + // Constructs analysis by analyzing the given module operation. + explicit SideEffectAnalysis(Operation* op); +}; + } // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir.cc b/tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir.cc index ffd9c149d2d..66447995709 100644 --- a/tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir.cc +++ b/tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir.cc @@ -102,8 +102,6 @@ class MlirTensor : public TracingTensorHandle { return type; } - void Release() override { delete this; } - Value getValue() { return value_; } // For LLVM style RTTI. @@ -564,7 +562,7 @@ Status MlirFunction::GetFunctionDef(tensorflow::FunctionDef** f) { } PassManager pm(func_.getContext()); pm.addNestedPass(CreateFunctionalToExecutorDialectConversionPass()); - pm.addNestedPass(CreateBreakUpIslandsPass()); + pm.addPass(CreateBreakUpIslandsPass()); // In case of failure, the `diag_handler` converts MLIR errors emitted to // the MLIRContext into a tensorflow::Status. diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc index 77008b55672..5345000b4bd 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc @@ -101,7 +101,8 @@ bool BlockWrapsSingleOp(Block* block) { } // end anonymous namespace TensorFlowDeviceDialect::TensorFlowDeviceDialect(MLIRContext* context) - : Dialect(/*name=*/"tf_device", context) { + : Dialect(/*name=*/"tf_device", context, + TypeID::get()) { addOperations< #define GET_OP_LIST #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc.inc" @@ -118,31 +119,6 @@ TensorFlowDeviceDialect::TensorFlowDeviceDialect(MLIRContext* context) // operation results are perfectly forwarded to the launch return. 
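
For reference, a minimal sketch (not part of this patch) of how a transformation pass might consume the refactored side-effect analysis above: the module-level `SideEffectAnalysis` constructor analyzes every function once, and per-function queries go through the `detail::SideEffectAnalysisInfo` objects it aggregates. The `GetAnalysisForFunc` accessor on the aggregate base class is assumed here, mirroring the `ResourceAliasAnalysis::GetAnalysisForFunc` call used in the constructor.

```
#include "mlir/IR/Function.h"
#include "mlir/IR/Module.h"
#include "tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h"

// Sketch only: walks each function and asks the per-function analysis info
// for the control predecessors of every op, in program order.
void QuerySideEffects(mlir::ModuleOp module) {
  mlir::TF::SideEffectAnalysis analysis(module);  // analyzes all funcs once
  for (auto func : module.getOps<mlir::FuncOp>()) {
    // Assumed accessor on the PerFunctionAggregateAnalysis base class.
    const auto& info = analysis.GetAnalysisForFunc(func);
    func.walk([&](mlir::Operation* op) {
      for (mlir::Operation* pred : info.DirectControlPredecessors(op)) {
        (void)pred;  // e.g., add a control edge pred -> op here
      }
    });
  }
}
```
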
bool LaunchOp::WrapsSingleOp() { return BlockWrapsSingleOp(&GetBody()); } -//===----------------------------------------------------------------------===// -// tf_device.return -//===----------------------------------------------------------------------===// - -namespace { -ParseResult ParseReturnOp(OpAsmParser* parser, OperationState* state) { - llvm::SmallVector op_info; - llvm::SmallVector types; - llvm::SMLoc loc = parser->getCurrentLocation(); - return failure(parser->parseOperandList(op_info) || - (!op_info.empty() && parser->parseColonTypeList(types)) || - parser->resolveOperands(op_info, types, loc, state->operands)); -} - -void Print(ReturnOp op, OpAsmPrinter* p) { - *p << op.getOperationName(); - if (op.getNumOperands() > 0) { - *p << ' '; - p->printOperands(op.getOperands()); - *p << " : "; - interleaveComma(op.getOperandTypes(), *p); - } -} -} // anonymous namespace - //===----------------------------------------------------------------------===// // tf_device.parallel_execute //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_device.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.h index 4c20d1ccc4f..688c8ca5715 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_device.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.h @@ -21,6 +21,7 @@ limitations under the License. #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project #include "mlir/IR/OpDefinition.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project @@ -35,6 +36,7 @@ namespace tf_device { // XlaRun. class TensorFlowDeviceDialect : public Dialect { public: + static StringRef getDialectNamespace() { return "tf_device"; } // Constructing TensorFlowDevice dialect under an non-null MLIRContext. explicit TensorFlowDeviceDialect(MLIRContext* context); }; diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_device_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_device_ops.td index 3a92e3237dc..d94a37d9b02 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_device_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_device_ops.td @@ -104,8 +104,7 @@ The `tf_device.return` operation terminates and returns values from a }]> ]; - let parser = [{ return Parse$cppClass(&parser, &result); }]; - let printer = [{ return Print(*this, &p); }]; + let assemblyFormat = "attr-dict ($results^ `:` type($results))?"; } def TfDevice_LaunchFuncOp : TfDevice_Op<"launch_func", []> { @@ -354,7 +353,10 @@ This op is used for outlining a cluster. ); let extraClassDeclaration = [{ - StringRef getFunc() { return func(); } + // returns the function that this operation will launch. 
+ FuncOp getFunc() { + return SymbolTable::lookupNearestSymbolFrom(*this, func()); + } }]; } diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc index 1b1d5ba6f3b..9c2968fab37 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc @@ -92,7 +92,8 @@ struct TensorFlowExecutorOpFolderDialectInterface } // namespace TensorFlowExecutorDialect::TensorFlowExecutorDialect(MLIRContext *context) - : Dialect(/*name=*/"tf_executor", context) { + : Dialect(/*name=*/"tf_executor", context, + TypeID::get()) { addOperations< #define GET_OP_LIST #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc.inc" @@ -190,14 +191,15 @@ LogicalResult Verify(GraphOp graph) { for (int i : llvm::seq(0, fetch.getNumOperands())) { Value operand = fetch.getOperand(i); // Break out of the loop at the first control operand encountered. + const int64_t num_results = graph.getNumResults(); if (operand.getType().isa()) { - if (i != graph.getNumResults()) + if (i != num_results) return fetch.emitOpError() << "operand #" << i << " is a control type, can't be bound to a graph result"; break; } - if (i >= graph.getNumResults()) + if (i >= num_results) return fetch.emitOpError() << "operand #" << i << " does not have a graph results to bind"; if (graph.getResult(i).getType() != operand.getType()) @@ -301,8 +303,8 @@ bool IslandOp::WrapsSingleOp() { namespace { LogicalResult Verify(IslandOp island) { - if (island.GetBody().empty()) - return island.emitOpError() << "expects a non-empty body"; + if (!island.GetBody().args_empty()) + return island.emitOpError() << "expects body without any arguments"; Operation &yield = island.GetBody().back(); if (!isa(yield)) @@ -311,7 +313,8 @@ LogicalResult Verify(IslandOp island) { // Ensure that the yield terminator operands matches the island results type. int result_count = island.getNumResults() - 1; // -1 for the control token - if (yield.getNumOperands() != result_count) + const int num_operands = yield.getNumOperands(); + if (num_operands != result_count) return yield.emitOpError() << "has " << yield.getNumOperands() << " operand, but island returns " << result_count; diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h index 3bb30f16c3d..61358172d6d 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h @@ -35,6 +35,7 @@ namespace tf_executor { class TensorFlowExecutorDialect : public Dialect { public: + static StringRef getDialectNamespace() { return "tf_executor"; } explicit TensorFlowExecutorDialect(MLIRContext *context); // Parses a type registered to this dialect. diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index a0e73f116cf..bf8d7015b46 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -87,7 +87,7 @@ tf.math.acosh(x) ==> [nan nan 0. 
0.62236255 5.9914584 9.903487 inf] TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_AddOp : TF_Op<"Add", [NoSideEffect, ResultsBroadcastableShape, TF_LayoutAgnostic, SameOperandsAndResultElementType]>, +def TF_AddOp : TF_Op<"Add", [NoSideEffect, ResultsBroadcastableShape, TF_LayoutAgnostic, TF_SameOperandsAndResultElementTypeResolveRef]>, WithBroadcastableBinOpBuilder { let summary = "Returns x + y element-wise."; @@ -136,7 +136,7 @@ Inputs must be of same size and shape. let hasFolder = 1; } -def TF_AddV2Op : TF_Op<"AddV2", [Commutative, NoSideEffect, ResultsBroadcastableShape, TF_LayoutAgnostic, SameOperandsAndResultElementType]>, +def TF_AddV2Op : TF_Op<"AddV2", [Commutative, NoSideEffect, ResultsBroadcastableShape, TF_LayoutAgnostic, TF_SameOperandsAndResultElementTypeResolveRef, TF_CwiseBinary]>, WithBroadcastableBinOpBuilder { let summary = "Returns x + y element-wise."; @@ -648,7 +648,7 @@ tf.math.atan(y) # [1.047, 0.785] = x TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_Atan2Op : TF_Op<"Atan2", [NoSideEffect, ResultsBroadcastableShape, SameOperandsAndResultElementType]>, +def TF_Atan2Op : TF_Op<"Atan2", [NoSideEffect, ResultsBroadcastableShape, TF_SameOperandsAndResultElementTypeResolveRef]>, WithBroadcastableBinOpBuilder { let summary = [{ Computes arctangent of `y/x` element-wise, respecting signs of the arguments. @@ -725,6 +725,30 @@ window in `value`. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_AvgPool3DOp : TF_Op<"AvgPool3D", [NoSideEffect]> { + let summary = "Performs 3D average pooling on the input."; + + let description = [{ +Each entry in `output` is the mean of the corresponding size `ksize` window in +`value`. + }]; + + let arguments = (ins + TF_FpTensor:$input, + + Confined]>:$ksize, + Confined]>:$strides, + TF_AnyStrAttrOf<["SAME", "VALID"]>:$padding, + DefaultValuedAttr, "NDHWC">:$data_format + ); + + let results = (outs + TF_FpTensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_AvgPool3DGradOp : TF_Op<"AvgPool3DGrad", [NoSideEffect]> { let summary = "Computes gradients of average pooling function."; @@ -765,7 +789,7 @@ def TF_AvgPoolGradOp : TF_Op<"AvgPoolGrad", [NoSideEffect]> { TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<1>; } -def TF_BatchMatMulOp : TF_Op<"BatchMatMul", [NoSideEffect, SameOperandsAndResultElementType]> { +def TF_BatchMatMulOp : TF_Op<"BatchMatMul", [NoSideEffect, TF_SameOperandsAndResultElementTypeResolveRef]> { let summary = "Multiplies slices of two tensors in batches."; let description = [{ @@ -806,7 +830,7 @@ It is computed as: let hasCanonicalizer = 1; } -def TF_BatchMatMulV2Op : TF_Op<"BatchMatMulV2", [NoSideEffect, SameOperandsAndResultElementType]> { +def TF_BatchMatMulV2Op : TF_Op<"BatchMatMulV2", [NoSideEffect, TF_SameOperandsAndResultElementTypeResolveRef]> { let summary = "Multiplies slices of two tensors in batches."; let description = [{ @@ -1326,48 +1350,6 @@ then the output will be TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_CaseOp : TF_Op<"Case", []> { - let summary = [{ -An n-way switch statement which calls a single branch function. - }]; - - let description = [{ -An n-way switch statement, implementing the following: - ``` - switch (branch_index) { - case 0: - output = branches[0](input); - break; - case 1: - output = branches[1](input); - break; - ... 
- case [[nbranches-1]]: - default: - output = branches[nbranches-1](input); - break; - } - ``` - }]; - - let arguments = (ins - I32Tensor:$branch_index, - Variadic:$input, - - Confined]>:$branches, - DefaultValuedAttr:$output_shapes - ); - - let results = (outs - Variadic:$output - ); - - TF_DerivedOperandTypeListAttr Tin = TF_DerivedOperandTypeListAttr<1>; - TF_DerivedResultTypeListAttr Tout = TF_DerivedResultTypeListAttr<0>; - - let hasCanonicalizer = 1; -} - def TF_CastOp : TF_Op<"Cast", [NoSideEffect, SameOperandsAndResultShape]> { let summary = "Cast x of type SrcT to y of DstT."; @@ -1422,7 +1404,7 @@ that are not a number (NaN) or infinity (Inf). Otherwise, passes `tensor` as-is. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_ClipByValueOp : TF_Op<"ClipByValue", [NoSideEffect, SameOperandsAndResultElementType]> { +def TF_ClipByValueOp : TF_Op<"ClipByValue", [NoSideEffect, TF_SameOperandsAndResultElementTypeResolveRef]> { let summary = "Clips tensor values to a specified min and max."; let description = [{ @@ -1534,6 +1516,29 @@ Mutually reduces multiple tensors of identical type and shape. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_CollectiveReduceV2Op : TF_Op<"CollectiveReduceV2", []> { + let summary = [{ +Mutually reduces multiple tensors of identical type and shape. + }]; + + let arguments = (ins + TensorOf<[F16, F32, F64, I32, I64]>:$input, + I32Tensor:$group_size, + I32Tensor:$group_key, + I32Tensor:$instance_key, + + TF_AnyStrAttrOf<["Min", "Max", "Mul", "Add"]>:$merge_op, + TF_AnyStrAttrOf<["Id", "Div"]>:$final_op, + DefaultValuedAttr:$communication_hint + ); + + let results = (outs + TensorOf<[F16, F32, F64, I32, I64]>:$data + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_ComplexOp : TF_Op<"Complex", [NoSideEffect, ResultsBroadcastableShape]> { let summary = "Converts two real numbers to a complex number."; @@ -1664,6 +1669,8 @@ def TF_ConcatV2Op : TF_Op<"ConcatV2", [NoSideEffect]> { let verifier = [{ return Verify(*this); }]; + + let hasCanonicalizer = 1; } def TF_ConjOp : TF_Op<"Conj", [NoSideEffect, SameOperandsAndResultType]> { @@ -2469,7 +2476,7 @@ Computes Psi, the derivative of Lgamma (the log of the absolute value of TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_DivOp : TF_Op<"Div", [NoSideEffect, ResultsBroadcastableShape, SameOperandsAndResultElementType]>, +def TF_DivOp : TF_Op<"Div", [NoSideEffect, ResultsBroadcastableShape, TF_SameOperandsAndResultElementTypeResolveRef]>, WithBroadcastableBinOpBuilder { let summary = "Returns x / y element-wise."; @@ -2494,7 +2501,7 @@ def TF_DivOp : TF_Op<"Div", [NoSideEffect, ResultsBroadcastableShape, SameOperan let hasFolder = 1; } -def TF_DivNoNanOp : TF_Op<"DivNoNan", [NoSideEffect, ResultsBroadcastableShape, SameOperandsAndResultElementType]>, +def TF_DivNoNanOp : TF_Op<"DivNoNan", [NoSideEffect, ResultsBroadcastableShape, TF_SameOperandsAndResultElementTypeResolveRef]>, WithBroadcastableBinOpBuilder { let summary = "Returns 0 if the denominator is zero."; @@ -3374,7 +3381,7 @@ def TF_FloorDivOp : TF_Op<"FloorDiv", [NoSideEffect, ResultsBroadcastableShape]> TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_FloorModOp : TF_Op<"FloorMod", [NoSideEffect, ResultsBroadcastableShape, SameOperandsAndResultElementType]>, +def TF_FloorModOp : TF_Op<"FloorMod", [NoSideEffect, ResultsBroadcastableShape, TF_SameOperandsAndResultElementTypeResolveRef]>, WithBroadcastableBinOpBuilder { let summary = [{ Returns 
element-wise remainder of division. When `x < 0` xor `y < 0` is @@ -3540,51 +3547,6 @@ The size of 1D Tensors matches the dimension C of the 4D Tensors. }]; } -def TF_FusedBatchNormV3Op : TF_Op<"FusedBatchNormV3", [NoSideEffect, TF_FoldOperandsTransposeInterface, TF_LayoutSensitiveInterface]> { - let summary = "Batch normalization."; - - let description = [{ -Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW". -The size of 1D Tensors matches the dimension C of the 4D Tensors. - }]; - - let arguments = (ins - TensorOf<[BF16, F16, F32]>:$x, - F32Tensor:$scale, - F32Tensor:$offset, - F32Tensor:$mean, - F32Tensor:$variance, - - DefaultValuedAttr:$epsilon, - DefaultValuedAttr:$exponential_avg_factor, - DefaultValuedAttr:$data_format, - DefaultValuedAttr:$is_training - ); - - let results = (outs - TensorOf<[BF16, F16, F32]>:$y, - F32Tensor:$batch_mean, - F32Tensor:$batch_variance, - F32Tensor:$reserve_space_1, - F32Tensor:$reserve_space_2, - F32Tensor:$reserve_space_3 - ); - - TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; - TF_DerivedOperandTypeAttr U = TF_DerivedOperandTypeAttr<1>; - - let extraClassDeclaration = [{ - // TF_FoldOperandsTransposeInterface: - SmallVector GetLayoutDependentArgs() { return {0}; } - SmallVector GetLayoutDependentResults() { return {0}; } - LogicalResult FoldOperandsPermutation(ArrayRef permutation); - - // TF_LayoutSensitiveInterface: - StringRef GetOptimalLayout(const RuntimeDevices& devices); - LogicalResult UpdateDataFormat(StringRef data_format); - }]; -} - def TF_GatherOp : TF_Op<"Gather", [NoSideEffect]> { let summary = "Gather slices from `params` according to `indices`."; @@ -4111,7 +4073,7 @@ def ApplyG(op, dy, _): TF_DerivedOperandTypeListAttr T = TF_DerivedOperandTypeListAttr<0>; } -def TF_IgammaOp : TF_Op<"Igamma", [NoSideEffect, ResultsBroadcastableShape, SameOperandsAndResultElementType]>, +def TF_IgammaOp : TF_Op<"Igamma", [NoSideEffect, ResultsBroadcastableShape, TF_SameOperandsAndResultElementTypeResolveRef]>, WithBroadcastableBinOpBuilder { let summary = [{ Compute the lower regularized incomplete Gamma function `P(a, x)`. @@ -4145,7 +4107,7 @@ Gamma function. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_IgammaGradAOp : TF_Op<"IgammaGradA", [NoSideEffect, ResultsBroadcastableShape, SameOperandsAndResultElementType]>, +def TF_IgammaGradAOp : TF_Op<"IgammaGradA", [NoSideEffect, ResultsBroadcastableShape, TF_SameOperandsAndResultElementTypeResolveRef]>, WithBroadcastableBinOpBuilder { let summary = "Computes the gradient of `igamma(a, x)` wrt `a`."; @@ -4161,7 +4123,7 @@ def TF_IgammaGradAOp : TF_Op<"IgammaGradA", [NoSideEffect, ResultsBroadcastableS TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_IgammacOp : TF_Op<"Igammac", [NoSideEffect, ResultsBroadcastableShape, SameOperandsAndResultElementType]>, +def TF_IgammacOp : TF_Op<"Igammac", [NoSideEffect, ResultsBroadcastableShape, TF_SameOperandsAndResultElementTypeResolveRef]>, WithBroadcastableBinOpBuilder { let summary = [{ Compute the upper regularized incomplete Gamma function `Q(a, x)`. @@ -4252,6 +4214,29 @@ Where to extract the key and value from a line is specified by `key_index` and let results = (outs); } +def TF_InplaceUpdateOp : TF_Op<"InplaceUpdate", [NoSideEffect]> { + let summary = "Updates specified rows 'i' with values 'v'."; + + let description = [{ +Computes `x[i, :] = v; return x`. 
+ +Originally this function is mutative however for compilation we make this +operation create / operate on a copy of `x`. + }]; + + let arguments = (ins + TF_Tensor:$x, + I32Tensor:$i, + TF_Tensor:$v + ); + + let results = (outs + TF_Tensor:$y + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_InvOp : TF_Op<"Inv", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Computes the reciprocal of x element-wise."; @@ -4772,7 +4757,7 @@ tf.math.log(x) ==> [-inf, -0.6931472, 0. , 1.609438] let hasCanonicalizer = 1; } -def TF_Log1pOp : TF_Op<"Log1p", [NoSideEffect, SameOperandsAndResultType]> { +def TF_Log1pOp : TF_Op<"Log1p", [NoSideEffect, SameOperandsAndResultType, TF_CwiseUnary]> { let summary = "Computes natural logarithm of (1 + x) element-wise."; let description = [{ @@ -4928,7 +4913,7 @@ def TF_LookupTableSizeV2Op : TF_Op<"LookupTableSizeV2", []> { ); } -def TF_MatMulOp : TF_Op<"MatMul", [NoSideEffect, SameOperandsAndResultElementType]> { +def TF_MatMulOp : TF_Op<"MatMul", [NoSideEffect, TF_SameOperandsAndResultElementTypeResolveRef]> { let summary = [{ Multiply the matrix "a" by the matrix "b". }]; @@ -5066,6 +5051,126 @@ which has shape (2, 4, 4) TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_MatrixDiagPartV3Op : TF_Op<"MatrixDiagPartV3", [NoSideEffect]> { + let summary = "Returns the batched diagonal part of a batched tensor."; + + let description = [{ +Returns a tensor with the `k[0]`-th to `k[1]`-th diagonals of the batched +`input`. + +Assume `input` has `r` dimensions `[I, J, ..., L, M, N]`. +Let `max_diag_len` be the maximum length among all diagonals to be extracted, +`max_diag_len = min(M + min(k[1], 0), N + min(-k[0], 0))` +Let `num_diags` be the number of diagonals to extract, +`num_diags = k[1] - k[0] + 1`. + +If `num_diags == 1`, the output tensor is of rank `r - 1` with shape +`[I, J, ..., L, max_diag_len]` and values: + +``` +diagonal[i, j, ..., l, n] + = input[i, j, ..., l, n+y, n+x] ; if 0 <= n+y < M and 0 <= n+x < N, + padding_value ; otherwise. +``` +where `y = max(-k[1], 0)`, `x = max(k[1], 0)`. + +Otherwise, the output tensor has rank `r` with dimensions +`[I, J, ..., L, num_diags, max_diag_len]` with values: + +``` +diagonal[i, j, ..., l, m, n] + = input[i, j, ..., l, n+y, n+x] ; if 0 <= n+y < M and 0 <= n+x < N, + padding_value ; otherwise. +``` +where `d = k[1] - m`, `y = max(-d, 0) - offset`, and `x = max(d, 0) - offset`. + +`offset` is zero except when the alignment of the diagonal is to the right. +``` +offset = max_diag_len - diag_len(d) ; if (`align` in {RIGHT_LEFT, RIGHT_RIGHT} + and `d >= 0`) or + (`align` in {LEFT_RIGHT, RIGHT_RIGHT} + and `d <= 0`) + 0 ; otherwise +``` +where `diag_len(d) = min(cols - max(d, 0), rows + min(d, 0))`. + +The input must be at least a matrix. + +For example: + +``` +input = np.array([[[1, 2, 3, 4], # Input shape: (2, 3, 4) + [5, 6, 7, 8], + [9, 8, 7, 6]], + [[5, 4, 3, 2], + [1, 2, 3, 4], + [5, 6, 7, 8]]]) + +# A main diagonal from each batch. +tf.matrix_diag_part(input) ==> [[1, 6, 7], # Output shape: (2, 3) + [5, 2, 7]] + +# A superdiagonal from each batch. +tf.matrix_diag_part(input, k = 1) + ==> [[2, 7, 6], # Output shape: (2, 3) + [4, 3, 8]] + +# A band from each batch. +tf.matrix_diag_part(input, k = (-1, 2)) + ==> [[[0, 3, 8], # Output shape: (2, 4, 3) + [2, 7, 6], + [1, 6, 7], + [5, 8, 0]], + [[0, 3, 4], + [4, 3, 8], + [5, 2, 7], + [1, 6, 0]]] + +# LEFT_RIGHT alignment. 
+tf.matrix_diag_part(input, k = (-1, 2), align="LEFT_RIGHT") + ==> [[[3, 8, 0], # Output shape: (2, 4, 3) + [2, 7, 6], + [1, 6, 7], + [0, 5, 8]], + [[3, 4, 0], + [4, 3, 8], + [5, 2, 7], + [0, 1, 6]]] + +# max_diag_len can be shorter than the main diagonal. +tf.matrix_diag_part(input, k = (-2, -1)) + ==> [[[5, 8], + [9, 0]], + [[1, 6], + [5, 0]]] + +# padding_value = 9 +tf.matrix_diag_part(input, k = (1, 3), padding_value = 9) + ==> [[[9, 9, 4], # Output shape: (2, 3, 3) + [9, 3, 8], + [2, 7, 6]], + [[9, 9, 2], + [9, 3, 4], + [4, 3, 8]]] + +``` + }]; + + let arguments = (ins + TF_Tensor:$input, + I32Tensor:$k, + TF_Tensor:$padding_value, + + DefaultValuedAttr, "RIGHT_LEFT">:$align + ); + + let results = (outs + TF_Tensor:$diagonal + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_MatrixDiagV2Op : TF_Op<"MatrixDiagV2", [NoSideEffect]> { let summary = [{ Returns a batched diagonal tensor with given batched diagonal values. @@ -5692,7 +5797,7 @@ def TF_MaxPoolGradOp : TF_Op<"MaxPoolGrad", [NoSideEffect]> { }]; } -def TF_MaximumOp : TF_Op<"Maximum", [NoSideEffect, ResultsBroadcastableShape, SameOperandsAndResultElementType]>, +def TF_MaximumOp : TF_Op<"Maximum", [NoSideEffect, ResultsBroadcastableShape, TF_SameOperandsAndResultElementTypeResolveRef]>, WithBroadcastableBinOpBuilder { let summary = "Returns the max of x and y (i.e. x > y ? x : y) element-wise."; @@ -5766,7 +5871,7 @@ retained with length 1. TF_DerivedOperandTypeAttr Tidx = TF_DerivedOperandTypeAttr<1>; } -def TF_MinimumOp : TF_Op<"Minimum", [NoSideEffect, ResultsBroadcastableShape, SameOperandsAndResultElementType]>, +def TF_MinimumOp : TF_Op<"Minimum", [NoSideEffect, ResultsBroadcastableShape, TF_SameOperandsAndResultElementTypeResolveRef]>, WithBroadcastableBinOpBuilder { let summary = "Returns the min of x and y (i.e. x < y ? x : y) element-wise."; @@ -5899,7 +6004,7 @@ graph_def = foo.get_concrete_function(tf.TensorSpec([10], tf.float32), tf.Tensor TF_DerivedResultTypeListAttr Toutputs = TF_DerivedResultTypeListAttr<0>; } -def TF_ModOp : TF_Op<"Mod", [NoSideEffect, ResultsBroadcastableShape, SameOperandsAndResultElementType]>, +def TF_ModOp : TF_Op<"Mod", [NoSideEffect, ResultsBroadcastableShape, TF_SameOperandsAndResultElementTypeResolveRef]>, WithBroadcastableBinOpBuilder { let summary = [{ Returns element-wise remainder of division. This emulates C semantics in that @@ -5925,7 +6030,7 @@ the result here is consistent with a truncating divide. E.g. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_MulOp : TF_Op<"Mul", [Commutative, NoSideEffect, ResultsBroadcastableShape, SameOperandsAndResultElementType]>, +def TF_MulOp : TF_Op<"Mul", [Commutative, NoSideEffect, ResultsBroadcastableShape, TF_SameOperandsAndResultElementTypeResolveRef, TF_CwiseBinary]>, WithBroadcastableBinOpBuilder { let summary = "Returns x * y element-wise."; @@ -5971,7 +6076,7 @@ Returns x * y element-wise. Returns zero if y is zero, even if x if infinite or TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_MultinomialOp : TF_Op<"Multinomial", []> { +def TF_MultinomialOp : TF_Op<"Multinomial", [TF_CannotDuplicate]> { let summary = "Draws samples from a multinomial distribution."; let arguments = (ins @@ -6332,6 +6437,8 @@ This is the opposite of `unpack`. 
let verifier = [{ return Verify(*this); }]; + + let hasFolder = 1; } def TF_PadOp : TF_Op<"Pad", [NoSideEffect, TF_FoldOperandsTransposeInterface]> { @@ -6426,7 +6533,36 @@ pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0] TF_DerivedOperandTypeAttr Tpaddings = TF_DerivedOperandTypeAttr<1>; } -def TF_PowOp : TF_Op<"Pow", [NoSideEffect, ResultsBroadcastableShape, SameOperandsAndResultElementType]>, +def TF_ParameterizedTruncatedNormalOp : TF_Op<"ParameterizedTruncatedNormal", [TF_CannotDuplicate]> { + let summary = [{ +Outputs random values from a normal distribution. The parameters may each be a + }]; + + let description = [{ +scalar which applies to the entire output, or a vector of length shape[0] which +stores the parameters for each batch. + }]; + + let arguments = (ins + TF_I32OrI64Tensor:$shape, + TF_FpTensor:$means, + TF_FpTensor:$stdevs, + TF_FpTensor:$minvals, + TF_FpTensor:$maxvals, + + DefaultValuedAttr:$seed, + DefaultValuedAttr:$seed2 + ); + + let results = (outs + TF_FpTensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr dtype = TF_DerivedOperandTypeAttr<1>; +} + +def TF_PowOp : TF_Op<"Pow", [NoSideEffect, ResultsBroadcastableShape, TF_SameOperandsAndResultElementTypeResolveRef]>, WithBroadcastableBinOpBuilder { let summary = "Computes the power of one value to another."; @@ -6809,6 +6945,33 @@ array([0.6666667, 1. , 1. ], dtype=float32) TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_RandomGammaOp : TF_Op<"RandomGamma", [TF_CannotDuplicate]> { + let summary = [{ +Outputs random values from the Gamma distribution(s) described by alpha. + }]; + + let description = [{ +This op uses the algorithm by Marsaglia et al. to acquire samples via +transformation-rejection from pairs of uniform and normal random variables. +See http://dl.acm.org/citation.cfm?id=358414 + }]; + + let arguments = (ins + TF_I32OrI64Tensor:$shape, + TensorOf<[F16, F32, F64]>:$alpha, + + DefaultValuedAttr:$seed, + DefaultValuedAttr:$seed2 + ); + + let results = (outs + TensorOf<[F16, F32, F64]>:$output + ); + + TF_DerivedOperandTypeAttr S = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<1>; +} + def TF_RandomGammaGradOp : TF_Op<"RandomGammaGrad", [NoSideEffect, ResultsBroadcastableShape]>, WithBroadcastableBinOpBuilder { let summary = [{ @@ -6827,7 +6990,60 @@ Computes the derivative of a Gamma random sample w.r.t. `alpha`. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_RandomShuffleOp : TF_Op<"RandomShuffle", [SameOperandsAndResultType]> { +def TF_RandomPoissonOp : TF_Op<"RandomPoisson", [TF_CannotDuplicate]> { + let summary = "Use RandomPoissonV2 instead."; + + let arguments = (ins + TF_I32OrI64Tensor:$shape, + TensorOf<[F16, F32, F64]>:$rate, + + DefaultValuedAttr:$seed, + DefaultValuedAttr:$seed2 + ); + + let results = (outs + TensorOf<[F16, F32, F64]>:$output + ); + + TF_DerivedOperandTypeAttr S = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr dtype = TF_DerivedOperandTypeAttr<1>; +} + +def TF_RandomPoissonV2Op : TF_Op<"RandomPoissonV2", [TF_CannotDuplicate]> { + let summary = [{ +Outputs random values from the Poisson distribution(s) described by rate. + }]; + + let description = [{ +This op uses two algorithms, depending on rate. If rate >= 10, then +the algorithm by Hormann is used to acquire samples via +transformation-rejection. +See http://www.sciencedirect.com/science/article/pii/0167668793909974. 
+ +Otherwise, Knuth's algorithm is used to acquire samples via multiplying uniform +random variables. +See Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer +Programming, Volume 2. Addison Wesley + }]; + + let arguments = (ins + TF_I32OrI64Tensor:$shape, + TensorOf<[F16, F32, F64, I32, I64]>:$rate, + + DefaultValuedAttr:$seed, + DefaultValuedAttr:$seed2 + ); + + let results = (outs + TensorOf<[F16, F32, F64, I32, I64]>:$output + ); + + TF_DerivedOperandTypeAttr R = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandTypeAttr S = TF_DerivedOperandTypeAttr<0>; + TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; +} + +def TF_RandomShuffleOp : TF_Op<"RandomShuffle", [SameOperandsAndResultType, TF_CannotDuplicate]> { let summary = "Randomly shuffles a tensor along its first dimension."; let description = [{ @@ -6856,7 +7072,7 @@ The tensor is shuffled along dimension 0, such that each `value[j]` is mapped TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_RandomStandardNormalOp : TF_Op<"RandomStandardNormal", []> { +def TF_RandomStandardNormalOp : TF_Op<"RandomStandardNormal", [TF_CannotDuplicate]> { let summary = "Outputs random values from a normal distribution."; let description = [{ @@ -6878,7 +7094,7 @@ The generated values will have mean 0 and standard deviation 1. TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; } -def TF_RandomUniformOp : TF_Op<"RandomUniform", []> { +def TF_RandomUniformOp : TF_Op<"RandomUniform", [TF_CannotDuplicate]> { let summary = "Outputs random values from a uniform distribution."; let description = [{ @@ -6905,7 +7121,37 @@ lower bound 0 is included in the range, while the upper bound 1 is excluded. }]; } -def TF_RangeOp : TF_Op<"Range", [NoSideEffect, SameOperandsAndResultElementType]> { +def TF_RandomUniformIntOp : TF_Op<"RandomUniformInt", [TF_CannotDuplicate]> { + let summary = "Outputs random integers from a uniform distribution."; + + let description = [{ +The generated values are uniform integers in the range `[minval, maxval)`. +The lower bound `minval` is included in the range, while the upper bound +`maxval` is excluded. + +The random integers are slightly biased unless `maxval - minval` is an exact +power of two. The bias is small for values of `maxval - minval` significantly +smaller than the range of the output (either `2^32` or `2^64`). + }]; + + let arguments = (ins + TF_I32OrI64Tensor:$shape, + TF_I32OrI64Tensor:$minval, + TF_I32OrI64Tensor:$maxval, + + DefaultValuedAttr:$seed, + DefaultValuedAttr:$seed2 + ); + + let results = (outs + TF_I32OrI64Tensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr Tout = TF_DerivedOperandTypeAttr<1>; +} + +def TF_RangeOp : TF_Op<"Range", [NoSideEffect, TF_SameOperandsAndResultElementTypeResolveRef]> { let summary = "Creates a sequence of numbers."; let description = [{ @@ -6940,6 +7186,28 @@ tf.range(start, limit, delta) ==> [3, 6, 9, 12, 15] ]; } +def TF_RangeDatasetOp : TF_Op<"RangeDataset", []> { + let summary = [{ +Creates a dataset with a range of values. Corresponds to python's xrange. 
+ }]; + + let description = [{ + }]; + + let arguments = (ins + I64Tensor:$start, + I64Tensor:$stop, + I64Tensor:$step, + + Confined]>:$output_types, + Confined]>:$output_shapes + ); + + let results = (outs + TF_VariantTensor:$handle + ); +} + def TF_RankOp : TF_Op<"Rank", [NoSideEffect]> { let summary = "Returns the rank of a tensor."; @@ -7030,7 +7298,7 @@ tf.real(input) ==> [-2.25, 3.25] TF_DerivedResultTypeAttr Tout = TF_DerivedResultTypeAttr<0>; } -def TF_RealDivOp : TF_Op<"RealDiv", [NoSideEffect, ResultsBroadcastableShape]>, +def TF_RealDivOp : TF_Op<"RealDiv", [NoSideEffect, ResultsBroadcastableShape, TF_CwiseBinary]>, WithBroadcastableBinOpBuilder { let summary = "Returns x / y element-wise for real types."; @@ -7272,6 +7540,7 @@ reshape(t, []) ==> 7 }]; let hasCanonicalizer = 1; + let hasFolder = 1; } def TF_ResizeBilinearOp : TF_Op<"ResizeBilinear", [NoSideEffect]> { @@ -9309,6 +9578,33 @@ The outputs are a deterministic function of `shape` and `seed`. TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; } +def TF_StatelessTruncatedNormalOp : TF_Op<"StatelessTruncatedNormal", [NoSideEffect]> { + let summary = [{ +Outputs deterministic pseudorandom values from a truncated normal distribution. + }]; + + let description = [{ +The generated values follow a normal distribution with mean 0 and standard +deviation 1, except that values whose magnitude is more than 2 standard +deviations from the mean are dropped and re-picked. + +The outputs are a deterministic function of `shape` and `seed`. + }]; + + let arguments = (ins + TF_I32OrI64Tensor:$shape, + TF_I32OrI64Tensor:$seed + ); + + let results = (outs + TF_FpTensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr Tseed = TF_DerivedOperandTypeAttr<1>; + TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; +} + def TF_StopGradientOp : TF_Op<"StopGradient", [NoSideEffect, TF_AllTypesMatch<["input", "output"]>]> { let summary = "Stops gradient computation."; @@ -9551,7 +9847,7 @@ Examples: TF_DerivedOperandSizeAttr N = TF_DerivedOperandSizeAttr<0>; } -def TF_SubOp : TF_Op<"Sub", [NoSideEffect, ResultsBroadcastableShape, SameOperandsAndResultElementType]>, +def TF_SubOp : TF_Op<"Sub", [NoSideEffect, ResultsBroadcastableShape, TF_SameOperandsAndResultElementTypeResolveRef, TF_CwiseBinary]>, WithBroadcastableBinOpBuilder { let summary = "Returns x - y element-wise."; @@ -9731,6 +10027,22 @@ For internal use only. ); } +def TF_TPUOrdinalSelectorOp : TF_Op<"TPUOrdinalSelector", []> { + let summary = "A TPU core selector Op."; + + let description = [{ +This Op produces a set of TPU cores (for warm-up) or a single TPU core +(for regular inference) to execute the TPU program on. The output is +consumed by TPUPartitionedCall. + }]; + + let arguments = (ins); + + let results = (outs + I32Tensor:$device_ordinals + ); +} + def TF_TPUReplicatedInputOp : TF_Op<"TPUReplicatedInput", [NoSideEffect]> { let summary = "Connects N inputs to an N-way replicated TPU computation."; @@ -10708,7 +11020,7 @@ Python Semantics. let hasCanonicalizer = 1; } -def TF_TruncateModOp : TF_Op<"TruncateMod", [NoSideEffect, ResultsBroadcastableShape, SameOperandsAndResultElementType]>, +def TF_TruncateModOp : TF_Op<"TruncateMod", [NoSideEffect, ResultsBroadcastableShape, TF_SameOperandsAndResultElementTypeResolveRef]>, WithBroadcastableBinOpBuilder { let summary = [{ Returns element-wise remainder of division. This emulates C semantics in that @@ -10734,7 +11046,7 @@ y + truncate_mod(x, y) = x`. 
TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_TruncatedNormalOp : TF_Op<"TruncatedNormal", []> { +def TF_TruncatedNormalOp : TF_Op<"TruncatedNormal", [TF_CannotDuplicate]> { let summary = "Outputs random values from a truncated normal distribution."; let description = [{ @@ -11181,7 +11493,7 @@ where(input) ==> [[0, 0, 0], TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_XdivyOp : TF_Op<"Xdivy", [NoSideEffect, ResultsBroadcastableShape, SameOperandsAndResultElementType]>, +def TF_XdivyOp : TF_Op<"Xdivy", [NoSideEffect, ResultsBroadcastableShape, TF_SameOperandsAndResultElementTypeResolveRef]>, WithBroadcastableBinOpBuilder { let summary = "Returns 0 if x == 0, and x / y otherwise, elementwise."; @@ -11547,7 +11859,7 @@ tensor such that tensor[...,:,:] = u[..., :, :] * Diag(s[..., :]) * Transpose(v[ TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_Xlog1pyOp : TF_Op<"Xlog1py", [NoSideEffect, SameOperandsAndResultElementType]> { +def TF_Xlog1pyOp : TF_Op<"Xlog1py", [NoSideEffect, TF_SameOperandsAndResultElementTypeResolveRef]> { let summary = "Returns 0 if x == 0, and x * log1p(y) otherwise, elementwise."; let arguments = (ins @@ -11562,7 +11874,7 @@ def TF_Xlog1pyOp : TF_Op<"Xlog1py", [NoSideEffect, SameOperandsAndResultElementT TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_XlogyOp : TF_Op<"Xlogy", [NoSideEffect, ResultsBroadcastableShape, SameOperandsAndResultElementType]>, +def TF_XlogyOp : TF_Op<"Xlogy", [NoSideEffect, ResultsBroadcastableShape, TF_SameOperandsAndResultElementTypeResolveRef]>, WithBroadcastableBinOpBuilder { let summary = "Returns 0 if x == 0, and x * log(y) otherwise, elementwise."; @@ -11640,7 +11952,7 @@ create these operators. TF_DerivedOperandSizeAttr num_args = TF_DerivedOperandSizeAttr<2>; } -def TF__FusedMatMulOp : TF_Op<"_FusedMatMul", [NoSideEffect]> { +def TF__FusedMatMulOp : TF_Op<"_FusedMatMul", [NoSideEffect, SameOperandsAndResultElementType]> { let summary = [{ Performs a MatMul followed by a specified series of operations. }]; @@ -11666,9 +11978,9 @@ expected to create these operators. }]; let arguments = (ins - F32Tensor:$a, - F32Tensor:$b, - Variadic:$args, + TensorOf<[BF16, F32]>:$a, + TensorOf<[BF16, F32]>:$b, + Variadic>:$args, DefaultValuedAttr:$transpose_a, DefaultValuedAttr:$transpose_b, @@ -11677,31 +11989,13 @@ expected to create these operators. ); let results = (outs - F32Tensor:$product + TensorOf<[BF16, F32]>:$product ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; TF_DerivedOperandSizeAttr num_args = TF_DerivedOperandSizeAttr<2>; } -def TF__HostComputeMlirOp : TF_Op<"_HostComputeMlir", []> { - let summary = "A host-side computation called from a TPU device."; - - let arguments = (ins - Variadic:$inputs, - - StrAttr:$key, - DefaultValuedAttr:$tpu_core - ); - - let results = (outs - Variadic:$outputs - ); - - TF_DerivedOperandTypeListAttr Tinputs = TF_DerivedOperandTypeListAttr<0>; - TF_DerivedResultTypeListAttr Toutputs = TF_DerivedResultTypeListAttr<0>; -} - def TF__RecvTPUEmbeddingActivationsOp : TF_Op<"_RecvTPUEmbeddingActivations", []> { let summary = "An op that receives embeddng activations on the TPU."; @@ -11761,6 +12055,45 @@ used to look up the program in the compilation cache. 
TF_DerivedOperandSizeAttr NumDynamicShapes = TF_DerivedOperandSizeAttr<0>; } +def TF__TPUCompileMlirPlaceholderProgramKeyOp : TF_Op<"_TPUCompileMlirPlaceholderProgramKey", []> { + let summary = [{ +Placeholder program key (compilation cache key) of a _TPUCompileMlir `program`. + }]; + + let description = [{ +This op can be used when certain rewrite passes materialize ops that require a +program key but the _TPUCompileMlir op has not been added yet. Subsequent +rewrite passes must replace this op with a _TPUCompileMlir op `program` output. + }]; + + let arguments = (ins); + + let results = (outs + TF_StrTensor:$program + ); +} + +def TF__XlaHostComputeMlirOp : TF_Op<"_XlaHostComputeMlir", []> { + let summary = [{ +A pseudo-op to represent host-side computation in an XLA program. + }]; + + let arguments = (ins + Variadic:$inputs, + + StrAttr:$send_key, + StrAttr:$recv_key, + DefaultValuedAttr:$tpu_core + ); + + let results = (outs + Variadic:$outputs + ); + + TF_DerivedOperandTypeListAttr Tinputs = TF_DerivedOperandTypeListAttr<0>; + TF_DerivedResultTypeListAttr Toutputs = TF_DerivedResultTypeListAttr<0>; +} + def TF__XlaRecvAtHostOp : TF_Op<"_XlaRecvAtHost", []> { let summary = [{ A placeholder op to receive values from a running XLA computation. diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td index 17424b54fc2..1755c975c23 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td @@ -59,10 +59,27 @@ TODO: Make invariants more structured so that we can reference them in ops. def TF_OperandsSameAsResultsTypeOrRef : NativeOpTrait< "TF::OperandsSameAsResultsTypeOrRef">; +// Op has the same operand and result element types (or type itself, if scalar) +// after resolving reference types (i.e., after converting reference types to +// their corresponding TensorFlow or standard types). +def TF_SameOperandsAndResultElementTypeResolveRef : NativeOpTrait< + "TF::SameOperandsAndResultElementTypeResolveRef">; + // Layout agnostic operations do not depend on the operands data layout (data // format), as an example all element wise operations are layout agnostic. def TF_LayoutAgnostic : NativeOpTrait<"TF::LayoutAgnostic">; +// Trait to indicate operations that cannot be duplicated as they might carry +// certain state around within their implementations. +def TF_CannotDuplicate : NativeOpTrait<"TF::CannotDuplicate">; + +// Coefficient wise binary operation with implicit broadcasting support, for +// example tf.Sub operation. +def TF_CwiseBinary : NativeOpTrait<"TF::CwiseBinary">; + +// Coefficient wise unary operation, for example tf.Sqrt operation. +def TF_CwiseUnary : NativeOpTrait<"TF::CwiseUnary">; + // Variant of broadcastable trait that considers TF's subtype behavior. class TF_OpIsBroadcastableToRes : And<[ TCOpResIsShapedTypePred, @@ -332,7 +349,7 @@ class TF_DerivedOperandTypeListAttr : DerivedAttr< // This returns a list of shapes so it is used for variadic operands that // can have different shapes. 
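
As an aside, a small sketch (not in this patch) of how the `TF_CannotDuplicate` trait declared above is typically consulted from C++. The `OpTrait::TF::CannotDuplicate` spelling follows the usual `NativeOpTrait` name expansion, and the `tf_traits.h` header location is an assumption.

```
#include "mlir/IR/Operation.h"
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h"  // assumed home of the trait

// Returns true if `op` may be cloned by an optimization; stateful ops such as
// the random-number kernels tagged TF_CannotDuplicate above must be preserved.
bool IsDuplicable(mlir::Operation* op) {
  return !op->hasTrait<mlir::OpTrait::TF::CannotDuplicate>();
}
```
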
class TF_DerivedOperandShapeListAttr : DerivedAttr< - "mlir::TF::OperandShapeRange", + "::mlir::TF::OperandShapeRange", "auto values = getODSOperands(" # idx # ");\n" "return {mlir::TF::OperandShapeIterator(values.begin()), " "mlir::TF::OperandShapeIterator(values.end())};", diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.cc new file mode 100644 index 00000000000..ffcc9f7dd4f --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.cc @@ -0,0 +1,22 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h" + +namespace mlir { +namespace TF { +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.cc.inc" +} // namespace TF +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index de6ce2d313a..dbad613d909 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -69,4363 +69,66 @@ limitations under the License. namespace mlir { namespace TF { -// Propagates underscore and device attributes from src to dst. -// TODO(b/158769932): This should be a general feature instead post some policy -// discussion. -static void PropagateAttributes(Operation *src, Operation *dst) { - auto device = mlir::Identifier::get("device", src->getContext()); - for (auto named_attr : src->getAttrs()) { - if (*named_attr.first.begin() == '_' || named_attr.first == device) - dst->setAttr(named_attr.first, named_attr.second); - } -} - -//===----------------------------------------------------------------------===// -// TF op helper functions -//===----------------------------------------------------------------------===// - -// Returns the RankedTensorType for the given operand. TensorFlow constant ops -// may have non-static shape because the shape is not propagated during constant -// folding. If the defining op for the given operand is a constant op, this -// routine uses the constant op's attribute to get the actual shape. -static RankedTensorType GetRankedTensorTypeForOperand(Value operand) { - DenseElementsAttr attr; - if (matchPattern(operand, m_Constant(&attr))) { - return attr.getType().dyn_cast(); - } - return operand.getType().dyn_cast(); -} - -// Returns true if the given `value` is of ranked float tensor type with the -// given `rank`. -static inline bool IsOfRankedFloatTensorType(RankedTensorType type, int rank) { - return type && type.getRank() == rank && - type.getElementType().isa(); -} - -// Returns true if the given `value` has the specified rank or has unranked -// type. 
-static inline bool IsOfRankOrUnranked(Value value, int64_t rank) { - RankedTensorType type = GetRankedTensorTypeForOperand(value); - return !type || type.getRank() == rank; -} - -// Returns true if the given `value` has at least the specified rank or has -// unranked type. -static inline bool HasRankAtLeast(Value value, int64_t rank) { - RankedTensorType type = GetRankedTensorTypeForOperand(value); - return !type || type.getRank() >= rank; -} - -// Returns true if the given `value` has at most the specified rank or has -// unranked type. -static inline bool HasRankAtMost(Value value, int64_t rank) { - RankedTensorType type = GetRankedTensorTypeForOperand(value); - return !type || type.getRank() <= rank; -} - -static bool IsUnknownDimOrRank(int64_t dim_or_rank) { - return dim_or_rank == -1; -} - -// Returns the tf.Equal/tf.NotEqual result type given `x` and `y` and inputs. If -// `incompatible_shape_error` is true, reports error if `x` and `y` has -// incompatible shapes. Otherwise, returns a tensor type with unknown rank. -static Type DeduceEqualCmpOpType(Builder *builder, Location loc, Value x, - Value y, BoolAttr incompatible_shape_error) { - auto result_type = - OpTrait::util::getBroadcastedType(x.getType(), y.getType()); - if (!result_type) { - if (incompatible_shape_error.getValue()) { - mlir::emitError(loc, "non-broadcastable operands"); - } else { - return UnrankedTensorType::get(builder->getI1Type()); - } - } - - auto ranked_type = result_type.dyn_cast(); - if (!ranked_type) return UnrankedTensorType::get(builder->getI1Type()); - - return RankedTensorType::get(ranked_type.getShape(), builder->getI1Type()); -} - -// Returns dimension index for the given TensorFlow axis that supports negative -// indexing. -static int64_t GetDimForAxis(int64_t axis, int64_t rank) { - return axis >= 0 ? axis : axis + rank; -} - -// Infers output type for reduction ops such as SumOp, MaxOp etc. -// TODO(b/e667204a): Move this logic to shape inference once it supports custom -// inference functions. -static Type InferReductionOpType(Value input, Value reduction_indices, - BoolAttr keep_dims, Builder *builder) { - Type input_ty = input.getType(); - Type element_ty = getElementTypeOrSelf(input_ty); - - // Output type is unranked if input type is not ranked. - auto ranked_ty = input_ty.dyn_cast(); - if (!ranked_ty) return UnrankedTensorType::get(element_ty); - int64_t rank = ranked_ty.getRank(); - - DenseIntElementsAttr indices; - if (!matchPattern(reduction_indices, m_Constant(&indices))) { - // Output type is unranked if reduction indices are not constant and reduced - // dimensions are not kept. - if (!keep_dims.getValue()) return UnrankedTensorType::get(element_ty); - - // Otherwise, output type has same rank as the input. - return RankedTensorType::get(SmallVector(rank, -1), element_ty); - } - - int64_t num_reduce_dim = 0; - llvm::SmallVector is_reduce_dim(rank, false); - for (const APInt &index : indices.getValues()) { - int64_t dim = GetDimForAxis(index.getSExtValue(), rank); - // Invalid input. - if (dim < 0 || dim >= rank) return UnrankedTensorType::get(element_ty); - - if (!is_reduce_dim[dim]) { - is_reduce_dim[dim] = true; - num_reduce_dim++; - } - } - - ArrayRef shape = ranked_ty.getShape(); - SmallVector out_shape; - out_shape.reserve(rank - (keep_dims.getValue() ? 
0 : num_reduce_dim)); - for (int64_t i = 0; i < rank; ++i) { - if (!is_reduce_dim[i]) - out_shape.push_back(shape[i]); - else if (keep_dims.getValue()) - out_shape.push_back(1); - } - return RankedTensorType::get(out_shape, element_ty); -} - -// Verifies that the given types are cast compatible. If not, emits appropriate -// error for the given op. If mask_one_dim is set to true, then the types are -// allowed to have one mismatching dimension. Masking one of the dimensions is -// useful for ops like Concat that requires all ranked inputs to have the same -// rank and match dimension sizes for all but one of the dimensions. -static LogicalResult VerifyTypesCompatibility( - Operation::operand_type_range types, bool mask_one_dim, Operation *op) { - constexpr int64_t kUninitialized = -1; - int64_t common_rank = kUninitialized; - llvm::SmallVector common_dims; - int64_t dim_to_mask = kUninitialized; - - // Initialize common_rank with rank of the first ranked type and verify that - // following ranked types have the same rank. - // Similarly, initialize each of the dimensions with the first type that has - // the dimension size available and verify that all following types have the - // same size for the dimension. However, if mask_one_dim is true, note down - // the dimension index on the first mismatch and ignore dimension at that - // index in following types. - for (Type ty : types) { - RankedTensorType ranked_ty = ty.dyn_cast(); - if (!ranked_ty) continue; - - int64_t rank = ranked_ty.getRank(); - if (common_rank == kUninitialized) { - common_rank = rank; - common_dims.resize(common_rank, kUninitialized); - } else if (common_rank != rank) { - return op->emitError() - << "operand type " << ranked_ty - << " is not compatible with preceding operands; expected rank: " - << common_rank; - } - - for (int64_t i = 0, e = common_rank; i != e; i++) { - if (i == dim_to_mask) continue; - - int64_t dim = ranked_ty.getDimSize(i); - if (dim == kUninitialized) continue; - - int64_t &common_dim = common_dims[i]; - if (common_dim == kUninitialized) { - common_dim = dim; - } else if (common_dim != dim) { - // If mask_one_dim is true, do not emit an error if this is the only - // dimension with mismatches. Note down the dimension to mask it from - // the following types. - if (mask_one_dim && dim_to_mask == kUninitialized) { - dim_to_mask = i; - continue; - } - - return op->emitError() << "operand type " << ranked_ty - << " is not compatible with preceding operands; " - "expected dimension at index " - << i << ": " << common_dim; - } - } - } - return success(); -} - -// This is a helper for the Select to SelectV2 canonicalization. The `data` rank -// refers to the rank of `t`/`e` (these two inputs have equal rank; this is -// checked in the verifier). -// -// In most cases, the predicate for Select can be used directly as the predicate -// for SelectV2. However, there is one case that varies, which is when the -// predicate is a tensor and the data is multidimensional. In this case, Select -// op semantics dictate that the predicate tensor length must match the size of -// the first data dimension. This varies from normal broadcasting semantics -// (which are used in SelectV2), so we must reshape the tensor in this case to -// be compatible. -static Value ReshapeSelectPredIfNecessary(OpBuilder *builder, Location loc, - Value cond, int data_rank) { - auto cond_tensor = cond.getType().cast(); - // Reshape is only needed in the case that the cond rank is 1 (i.e. it is - // a vector) AND t/e rank is > 1. 
- if (cond_tensor.getRank() != 1 || data_rank <= 1) { - // No reshape necessary. Leave cond as it is. - return cond; - } - - // This is the case where a reshape is needed. We want to construct the - // shape [x,1,...1], where x is the value in the pred tensor and the - // length of the shape is equal to data_rank. - SmallVector shape(data_rank, 1); - shape[0] = cond_tensor.getShape().front(); - auto new_shape_type = - RankedTensorType::get({data_rank}, builder->getIntegerType(64)); - auto shape_attr = DenseIntElementsAttr::get(new_shape_type, shape); - auto new_shape = builder->create(loc, shape_attr); - return builder->create(loc, cond, new_shape); -} - -//===----------------------------------------------------------------------===// -// Helper functions detect device capabilities from RuntimeDevices. -//===----------------------------------------------------------------------===// - -namespace { -using DeviceNameUtils = ::tensorflow::DeviceNameUtils; -using ParsedName = ::tensorflow::DeviceNameUtils::ParsedName; - -bool IsGpuDevice(const DeviceNameUtils::ParsedName &device) { - return device.type == ::tensorflow::DEVICE_GPU; -} - -} // namespace - -// Returns true if at least one GPU device is available at runtime. -bool CanUseGpuDevice(const RuntimeDevices &devices) { - return llvm::any_of(devices.device_names(), IsGpuDevice); -} - -// Returns true if all of the GPUs available at runtime support TensorCores -// (NVIDIA compute capability >= 7.0). -bool CanUseTensorCores(const RuntimeDevices &devices) { - auto has_tensor_cores = [&](const DeviceNameUtils::ParsedName &device) { - auto md = devices.GetGpuDeviceMetadata(device); - return md ? md->cc_major().getInt() >= 7 : false; - }; - return llvm::all_of( - llvm::make_filter_range(devices.device_names(), IsGpuDevice), - has_tensor_cores); -} - -// Returns true if operation does not have explicit device placement that would -// prevent it from running on GPU device. -bool CanUseGpuDevice(Operation *op) { - auto device_attr = op->getAttrOfType("device"); - if (!device_attr || device_attr.getValue().empty()) return true; - - DeviceNameUtils::ParsedName device; - if (!DeviceNameUtils::ParseFullName(device_attr.getValue().str(), &device)) - return false; - - // We can't use GPU if operation explicitly placed on non-GPU device. - return !device.has_type || device.type == ::tensorflow::DEVICE_GPU; -} - -//===----------------------------------------------------------------------===// -// TF op helper functions to work with layout transformation. -//===----------------------------------------------------------------------===// - -SmallVector ReversePermutation(ArrayRef permutation) { - SmallVector reverse(permutation.size()); - for (size_t i = 0; i < permutation.size(); ++i) { - reverse[permutation[i]] = i; - } - return reverse; -} - -SmallVector GetDataFormatPermutation(StringRef from, StringRef to) { - if (from == "NHWC" && to == "NCHW") { - return {0, 3, 1, 2}; - } else if (from == "NCHW" && to == "NHWC") { - return {0, 2, 3, 1}; - } else { - return {}; - } -} - -// Shuffle elements in the `attr` according to the permutation. Optional -// `inner_size` allows to shuffle array attributes created from rank 2 tensors -// on outer dimension only. 
-ArrayAttr ShuffleArrayAttr(ArrayAttr attr, ArrayRef permutation, - int inner_size = 1) { - if (attr.size() == 0) return attr; - - assert(attr.size() % inner_size == 0); - assert(attr.size() / inner_size == permutation.size()); - - SmallVector values{attr.begin(), attr.end()}; - SmallVector shuffled(values.size()); - - for (size_t i = 0; i < permutation.size(); ++i) { - for (size_t j = 0; j < inner_size; ++j) { - shuffled[i * inner_size + j] = values[permutation[i] * inner_size + j]; - } - } - - return ArrayAttr::get(shuffled, attr.getContext()); -} - -// Shuffle ranked tensor dimensions according to the permutation. -Type ShuffleRankedTensorType(Type type, ArrayRef permutation) { - if (auto ranked_type = type.dyn_cast()) { - ArrayRef shape = ranked_type.getShape(); - assert(permutation.size() == shape.size()); - - SmallVector new_shape(permutation.size()); - for (size_t i = 0; i < permutation.size(); ++i) - new_shape[i] = shape[permutation[i]]; - - return RankedTensorType::get(new_shape, ranked_type.getElementType()); - } - - return type; -} - -static bool AreCancellablePermutations(DenseIntElementsAttr perm0, - DenseIntElementsAttr perm1) { - if (perm0.getNumElements() == 0 || perm1.getNumElements() == 0) return false; - if (perm0.getNumElements() != perm1.getNumElements()) return false; - - SmallVector perm0_values; - for (const auto &value : perm0.getIntValues()) - perm0_values.push_back(value.getSExtValue()); - - SmallVector perm1_values; - for (const auto &value : perm1.getIntValues()) - perm1_values.push_back(value.getSExtValue()); - - for (int i = 0; i < perm0_values.size(); ++i) { - if (perm0_values[perm1_values[i]] != i) return false; - } - - return true; -} - -// Default implementation of `LayoutSensitiveInterface::UpdateDataFormat` for -// layout sensitive operations that do not have any additional layout dependent -// attributes besides `data_format` string. -template -LogicalResult UpdateDataFormat(StringRef data_format, Op *op) { - auto perm = GetDataFormatPermutation(op->data_format(), data_format); - if (perm.empty()) return failure(); - - // Update data format attribute. - op->setAttr("data_format", StringAttr::get(data_format, op->getContext())); - - // Update types for all layout sensitive results. - auto layout_sensitive = cast(op->getOperation()); - for (unsigned idx : layout_sensitive.GetLayoutDependentResults()) { - OpResult result = op->getOperation()->getResult(idx); - result.setType(ShuffleRankedTensorType(result.getType(), perm)); - } - - return success(); -} - -// Default implementation for folding operand transpose into the operation. -// See `FoldOperandsTransposeInterface::FoldOperandsPermutation`. -template -LogicalResult FoldOperandsPermutation( - ArrayRef permutation, Op *op, - ArrayRef> shuffle_attrs = {}) { - MLIRContext *context = op->template getParentOfType().getContext(); - - // We only support NHWC <-> NCHW permutations. - static constexpr std::array kNchwToNhwc = {0, 2, 3, 1}; - static constexpr std::array kNhwcToNchw = {0, 3, 1, 2}; - - // Operation data format after folding `permutation`. 
- StringRef target_data_format = [&]() -> StringRef { - if (op->data_format() == "NHWC" && permutation.equals(kNchwToNhwc)) { - return "NCHW"; // cancel NCHW->NHWC operand permutation - } else if (op->data_format() == "NCHW" && permutation.equals(kNhwcToNchw)) { - return "NHWC"; // cancel NHWC->NCHW operand permutation - } else { - return ""; - } - }(); - if (target_data_format.empty()) return failure(); - - // To fold operand `permutation` into the `op` we need shuffle all layout - // dependent attributes and types with a reverse permutation, and change - // operation data format to `target_data_format`. - // - // Example: - // %1 = SomeOp(...) {data_format = NHWC} - // %2 = Transpose(%1) {permutation = NHWC->NCHW} - // %3 = Op(%2) {data_format = NCHW} - // - // To bypass %2 we have to change data format to shuffle data format from NCHW - // to NHWC, which is the reverse of operand permutation (function argument). - auto reverse_permutation = - GetDataFormatPermutation(op->data_format(), target_data_format); - if (reverse_permutation.empty()) return failure(); - - op->setAttr("data_format", StringAttr::get(target_data_format, context)); - - for (auto pair : shuffle_attrs) { - StringRef attr_name = pair.first; - ArrayAttr attr_value = pair.second; - op->setAttr(attr_name, ShuffleArrayAttr(attr_value, reverse_permutation)); - } - - auto fold = cast(op->getOperation()); - for (unsigned idx : fold.GetLayoutDependentResults()) { - OpResult result = op->getOperation()->getResult(idx); - result.setType( - ShuffleRankedTensorType(result.getType(), reverse_permutation)); - } - - return success(); -} - -//===----------------------------------------------------------------------===// -// Rewrite Pattern for removing trivial Arithmetic op. -//===----------------------------------------------------------------------===// - -namespace { -// Fold Arithmetic Op if one of the operands is a constant known to be an -// Identity (e.g. X+0, X*1, etc...). For commutative operations fold if -// known identity value is either lhs or rhs. -template < - typename OpT, - typename std::enable_if::value>::type * = nullptr> -OpFoldResult IdentityArithmeticOpFolder(OpT arithmetic_op, - ArrayRef operands) { - auto lhs_type = arithmetic_op.x().getType().template cast(); - auto rhs_type = arithmetic_op.y().getType().template cast(); - auto result_type = - arithmetic_op.getResult().getType().template cast(); - - // We can fold arithmetic operation only of we can prove that we will not - // accidentally hide a broadcasting error. - auto is_valid_broadcasting = [](ShapedType operand_ty, ShapedType identity_ty, - ShapedType result_ty) -> bool { - // Scalar identity is broadcastable to any operand shape, we only need to - // check that operand has the same shape as a result. - bool scalar_identity = identity_ty.hasRank() && identity_ty.getRank() == 0; - if (scalar_identity) return operand_ty == result_ty; - - // If identity is not a scalar, we must verify that all shapes are equal - // and statically known. - // - // TODO(ezhulenev): Fold if identity shape is statically know to be - // broadcastable to the operand shape. - return operand_ty == result_ty && identity_ty == result_ty && - result_ty.hasStaticShape(); - }; - - // Check that we have a constant operand on one side (candidate for identity). 
- const bool is_commutative = - (std::is_same::value || std::is_same::value); - auto lhs_attr = operands[0].dyn_cast_or_null(); - auto rhs_attr = operands[1].dyn_cast_or_null(); - if (!rhs_attr && !(is_commutative && lhs_attr)) return {}; - - // Mul and Div ops have identity value one while AddV2 and SubOp have identity - // value zero. - const int identity = - (std::is_same::value || std::is_same::value || - std::is_same::value) - ? 1 - : 0; - - Type element_ty = lhs_type.getElementType(); - Attribute identity_attr; - if (auto ty = element_ty.template dyn_cast()) { - identity_attr = FloatAttr::get(ty, static_cast(identity)); - } else if (auto ty = element_ty.template dyn_cast()) { - identity_attr = IntegerAttr::get(ty, static_cast(identity)); - } else { - return {}; - } - - // Fold: Op(Operand, Identity) -> Operand. - if (rhs_attr && is_valid_broadcasting(lhs_type, rhs_type, result_type)) { - if (rhs_attr.isSplat() && rhs_attr.getSplatValue() == identity_attr) - return arithmetic_op.x(); - } - - // Fold: Op(Identity, Operand) -> Operand for commutative operations. - if (lhs_attr && is_commutative && - is_valid_broadcasting(rhs_type, lhs_type, result_type)) { - if (lhs_attr.isSplat() && lhs_attr.getSplatValue() == identity_attr) - return arithmetic_op.y(); - } - - return {}; -} -} // namespace - -namespace { -#include "tensorflow/compiler/mlir/tensorflow/transforms/generated_canonicalize.inc" -} // namespace - -//===----------------------------------------------------------------------===// -// AddOp -//===----------------------------------------------------------------------===// - -void AddOp::getCanonicalizationPatterns(OwningRewritePatternList &results, - MLIRContext *context) { - results.insert(context); -} - -//===----------------------------------------------------------------------===// -// AddNOp -//===----------------------------------------------------------------------===// - -OpFoldResult AddNOp::fold(ArrayRef operands) { - if (operands.size() == 1) return *inputs().begin(); - return {}; -} - -//===----------------------------------------------------------------------===// -// AddV2Op -//===----------------------------------------------------------------------===// - -void AddV2Op::getCanonicalizationPatterns(OwningRewritePatternList &results, - MLIRContext *context) { - results.insert(context); -} - -OpFoldResult AddV2Op::fold(ArrayRef operands) { - return IdentityArithmeticOpFolder(*this, operands); -} - -//===----------------------------------------------------------------------===// -// AllOp -//===----------------------------------------------------------------------===// - -// Verifies an reduction op's `input` and reduction `dims`. 
-static LogicalResult VerifyReductionInputAndDims(Value input, Value dims, - Location loc) { - auto dims_type = dims.getType().dyn_cast(); - if (!dims_type) return success(); - if (dims_type.getRank() > 1) - return emitError(loc, "dimensions can only be 0D or 1D tensor"); - - auto input_type = input.getType().dyn_cast(); - if (!input_type) return success(); - int64_t rank = input_type.getRank(); - - DenseIntElementsAttr dims_attr; - if (!matchPattern(dims, m_Constant(&dims_attr))) return success(); - for (const auto &dim_pair : llvm::enumerate(dims_attr)) { - int64_t cur_dim = dim_pair.value().getSExtValue(); - if (cur_dim < -rank || cur_dim >= rank) - return emitError(loc) - << dim_pair.index() << "-th dimension should be in the range of [-" - << rank << ", " << rank << ")"; - } - - return success(); -} - -static LogicalResult Verify(AllOp op) { - return VerifyReductionInputAndDims(op.input(), op.reduction_indices(), - op.getLoc()); -} - -//===----------------------------------------------------------------------===// -// AnyOp -//===----------------------------------------------------------------------===// - -static LogicalResult Verify(AnyOp op) { - return VerifyReductionInputAndDims(op.input(), op.reduction_indices(), - op.getLoc()); -} - -//===----------------------------------------------------------------------===// -// AssertOp -//===----------------------------------------------------------------------===// - -namespace { - -// Removes Assert with constant true predicate. -struct AssertWithTrue : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(AssertOp op, - PatternRewriter &rewriter) const override { - ElementsAttr cst; - if (matchPattern(op.condition(), m_Constant(&cst))) { - if (cst.getValue({}).getValue()) { - rewriter.eraseOp(op); - return success(); - } - } - return failure(); - } -}; -} // namespace - -void AssertOp::getCanonicalizationPatterns(OwningRewritePatternList &results, - MLIRContext *context) { - results.insert(context); -} - -//===----------------------------------------------------------------------===// -// BatchMatMulOp -//===----------------------------------------------------------------------===// - -void BatchMatMulOp::getCanonicalizationPatterns( - OwningRewritePatternList &results, MLIRContext *context) { - results.insert(context); -} - -//===----------------------------------------------------------------------===// -// BatchMatMulV2Op -//===----------------------------------------------------------------------===// - -static LogicalResult Verify(BatchMatMulV2Op op) { - if (!HasRankAtLeast(op.x(), 2)) { - return op.emitOpError("requires lhs operand to have rank at least two"); - } - if (!HasRankAtLeast(op.y(), 2)) { - return op.emitOpError("requires rhs operand to have rank at least two"); - } - return success(); -} - -void BatchMatMulV2Op::getCanonicalizationPatterns( - OwningRewritePatternList &results, MLIRContext *context) { - results.insert(context); -} - -//===----------------------------------------------------------------------===// -// BatchToSpaceOp -//===----------------------------------------------------------------------===// - -static LogicalResult Verify(BatchToSpaceOp op) { - // Op already has a constraint that block_size >= 2. 
- int64_t block_size = op.block_size().getSExtValue(); - - llvm::SmallVector input_shape(4, ShapedType::kDynamicSize); - auto input_type = op.input().getType().cast(); - if (input_type.hasRank()) { - if (input_type.getRank() != 4) - return op.emitOpError() - << "requires input to be a 4D tensor, but got " << input_type; - - int64_t input_batch = input_type.getDimSize(0); - if (input_batch != ShapedType::kDynamicSize && - input_batch % (block_size * block_size) != 0) { - return op.emitOpError() - << "requires input batch (dimension 0) to be evenly divisible " - "by (block_size * block_size), but got input batch " - << input_batch << " and block_size " << block_size; - } - - input_shape.assign(input_type.getShape().begin(), - input_type.getShape().end()); - } - - auto crops_type = op.crops().getType().cast(); - if (crops_type.hasRank()) { - if (crops_type.getRank() != 2) - return op.emitOpError() - << "requires crops to be a 2D tensor, but got " << crops_type; - - auto dim_of_size = [&](int64_t dim, int64_t size) { - if (crops_type.isDynamicDim(dim)) return true; - return crops_type.getDimSize(dim) == size; - }; - if (!dim_of_size(0, 2) || !dim_of_size(1, 2)) - return op.emitOpError() - << "requires crops to be a tensor<2x2>, but got " << crops_type; - } - - DenseIntElementsAttr crops_attr; - // Crops are defined as [[crop_top, crop_bottom], [crop_left, crop_right]], - // and flattened as [crop_top, crop_bottom, crop_left, crop_right] - llvm::SmallVector crops_values; - if (matchPattern(op.crops(), m_Constant(&crops_attr))) { - assert(crops_attr.getNumElements() == 4 && - "tf.BatchToSpace crops must have 4 elements"); - - auto crops_range = crops_attr.getIntValues(); - for (const auto &crops_value : crops_range) { - int64_t crops_value_int = crops_value.getSExtValue(); - if (crops_value_int < 0) - return op.emitOpError() - << "requires all crop values to be nonnegative, but got " - << crops_attr; - - crops_values.push_back(crops_value_int); - } - } - - auto output_type = op.output().getType().cast(); - if (output_type.hasRank()) { - if (output_type.getRank() != 4) - return op.emitOpError() - << "requires output to be a 4D tensor, but got " << output_type; - - auto static_dims = [](int64_t dim_a, int64_t dim_b) { - return dim_a != ShapedType::kDynamicSize && - dim_b != ShapedType::kDynamicSize; - }; - - auto output_shape = output_type.getShape(); - - // output batch = input batch / (block_size * block_size). - int64_t input_batch = input_shape[0]; - int64_t output_batch = output_shape[0]; - if (static_dims(input_batch, output_batch) && - (output_batch * block_size * block_size) != input_batch) - return op.emitOpError() - << "requires output batch (dimension 0) to be equal to input " - "batch (dimension 0) / (block_size * block_size), but got " - "output batch " - << output_batch << ", input batch " << input_batch - << ", and block_size " << block_size; - - auto check_spatial_dim = [&](int64_t spatial_dim_index, - llvm::StringRef dim_name, - llvm::StringRef crop_a_name, - llvm::StringRef crop_b_name) -> LogicalResult { - int64_t input_dim = input_shape[spatial_dim_index]; - int64_t output_dim = output_shape[spatial_dim_index]; - if (!static_dims(input_dim, output_dim)) return success(); - - int64_t input_dim_pad = input_dim * block_size; - // If crops are unknown, the maximum output spatial dim size is input - // spatial dim size * block_size, as crops can be minimum 0. 
- if (crops_values.empty() && output_dim > input_dim * block_size) - return op.emitOpError() - << "requires output " << dim_name << " (dimension " - << spatial_dim_index << ") to be less than or equal to input " - << dim_name << " (dimension " << spatial_dim_index - << ") * block_size, but got output " << dim_name << " " - << output_dim << ", input " << dim_name << " " << input_dim - << ", and block_size " << block_size; - - if (!crops_values.empty()) { - // output spatial dim = input spatial dim * block_size - crops. - int64_t crop_a = crops_values[2 * (spatial_dim_index - 1)]; - int64_t crop_b = crops_values[2 * (spatial_dim_index - 1) + 1]; - if (output_dim != input_dim_pad - crop_a - crop_b) - return op.emitOpError() - << "requires output " << dim_name << " (dimension " - << spatial_dim_index << ") to be equal to input " << dim_name - << " (dimension " << spatial_dim_index << ") * block_size - " - << crop_a_name << " - " << crop_b_name << ", but got output " - << dim_name << " " << output_dim << ", input " << dim_name - << " " << input_dim << ", " << crop_a_name << " " << crop_a - << ", " << crop_b_name << " " << crop_b << ", and block_size " - << block_size; - } - - return success(); - }; - - if (failed(check_spatial_dim(1, "height", "crop_top", "crop_bottom")) || - failed(check_spatial_dim(2, "width", "crop_left", "crop_right"))) - return failure(); - - int64_t input_depth = input_shape[3]; - int64_t output_depth = output_shape[3]; - if (static_dims(input_depth, output_depth) && output_depth != input_depth) - return op.emitOpError() - << "requires output depth (dimension 3) to be equal to input " - "depth (dimension 3), but got output depth " - << output_depth << " and input depth " << input_depth; - } - - return success(); -} - -void BatchToSpaceOp::getCanonicalizationPatterns( - OwningRewritePatternList &results, MLIRContext *context) { - results.insert(context); -} - -//===----------------------------------------------------------------------===// -// BiasAddOp -//===----------------------------------------------------------------------===// - -// Verifies that, -// * the value and bias operands have valid ranks or are unranked. -// * Channel dimension of the value operand and length of bias matches if they -// are not unknown. -// -static LogicalResult Verify(BiasAddOp op) { - StringRef format = op.data_format(); - if (format == "NHWC") { - if (!HasRankAtLeast(op.value(), 2)) - return op.emitOpError( - "requires value operand to have rank at least two with `NHWC` data " - "format"); - } else { - // Op definition requires data_format to be either NHWC or NCHW. - DCHECK_EQ(format.str(), "NCHW"); - if (!HasRankAtLeast(op.value(), 3)) - return op.emitOpError( - "requires value operand to have rank at least three with `NCHW` data " - "format"); - } - - if (!IsOfRankOrUnranked(op.bias(), 1)) - return op.emitOpError("requires bias operand to have rank exactly one"); - - RankedTensorType value_ty = op.value().getType().dyn_cast(); - RankedTensorType bias_ty = op.bias().getType().dyn_cast(); - if (!bias_ty || !value_ty) return success(); - - // TODO(hinsu): Leverage tensor_format.h utility in TensorFlow to compute - // dimension indices based on format. - int64_t feature_dim_idx = format == "NHWC" ? 
value_ty.getRank() - 1 : 1; - int64_t feature_dim = value_ty.getDimSize(feature_dim_idx); - int64_t bias_len = bias_ty.getDimSize(0); - if (feature_dim != -1 && bias_len != -1 && feature_dim != bias_len) { - return op.emitOpError() - << "requires channel dimension and feature dimension to match; " - "found " - << feature_dim << " and " << bias_len << ", respectively"; - } - return success(); -} - -//===----------------------------------------------------------------------===// -// BiasAddGradOp -//===----------------------------------------------------------------------===// - -// Verifies that, -// * the out_backprop operands have valid ranks or are unranked. -// -static LogicalResult Verify(BiasAddGradOp op) { - StringRef format = op.data_format(); - if (format == "NHWC") { - if (!HasRankAtLeast(op.out_backprop(), 2)) - return op.emitOpError( - "requires out_backprop operand to have rank at least two with `NHWC` " - "data format"); - } else { - // Op definition requires data_format to be either NHWC or NCHW. - DCHECK_EQ(format.str(), "NCHW"); - if (!HasRankAtLeast(op.out_backprop(), 3)) - return op.emitOpError( - "requires out_backprop operand to have rank at least three with " - "`NCHW` data format"); - } - - return success(); -} - -//===----------------------------------------------------------------------===// -// BiasAddV1Op -//===----------------------------------------------------------------------===// - -void BiasAddV1Op::getCanonicalizationPatterns(OwningRewritePatternList &results, - MLIRContext *context) { - results.insert(context); -} - -//===----------------------------------------------------------------------===// -// BitcastOp -//===----------------------------------------------------------------------===// - -void BitcastOp::getCanonicalizationPatterns(OwningRewritePatternList &results, - MLIRContext *context) { - results.insert(context); -} - -//===----------------------------------------------------------------------===// -// BroadcastToOp -//===----------------------------------------------------------------------===// - -static LogicalResult Verify(BroadcastToOp op) { - // TODO(antiagainst): check that - // * The 'shape' input is an 1-D int tensor. - // * Each dimension pair of the source and target shapes are either equal - // or one of them is one. - return success(); -} - -//===----------------------------------------------------------------------===// -// CaseOp -//===----------------------------------------------------------------------===// - -class FoldConstantCaseOp : public OpRewritePattern { - public: - explicit FoldConstantCaseOp(MLIRContext *context) - : OpRewritePattern(context) {} - LogicalResult matchAndRewrite(TF::CaseOp op, - PatternRewriter &rewriter) const override; -}; - -LogicalResult FoldConstantCaseOp::matchAndRewrite( - TF::CaseOp op, PatternRewriter &rewriter) const { - // Extract the constant cond value. - DenseIntElementsAttr branch; - if (!matchPattern(op.branch_index(), m_Constant(&branch))) return failure(); - - // Only attempt to fold scalar valued case statements. - // TODO(jpienaar): This can be removed if CaseOp's verifier covers it. - if (!branch.getType().cast().getShape().empty()) - return failure(); - - int index = *branch.getValues().begin(); - // TODO(jpienaar): This can be removed if CaseOp's verifier covers it. 
- if (index >= op.branches().size()) return failure(); - - auto func = op.branches()[index].cast(); - auto empty = rewriter.getStringAttr(""); - auto call_op = rewriter.create( - op.getLoc(), op.getResultTypes(), op.getOperands().drop_front(), func, - /*config=*/empty, /*config_proto=*/empty, /*executor_type=*/empty); - PropagateAttributes(op.getOperation(), call_op); - rewriter.replaceOp(op, call_op.getResults()); - return success(); -} - -void CaseOp::getCanonicalizationPatterns(OwningRewritePatternList &results, - MLIRContext *context) { - results.insert(context); -} - -//===----------------------------------------------------------------------===// -// CastOp -//===----------------------------------------------------------------------===// - -OpFoldResult CastOp::fold(ArrayRef operands) { - // Cast with the same type is a no-op. - Value operand = getOperand(); - if (getType() == operand.getType()) return operand; - return {}; -} - -//===----------------------------------------------------------------------===// -// ConcatOp and ConcatV2Op -//===----------------------------------------------------------------------===// - -template ::value>::type * = nullptr> -static LogicalResult Verify(OpT op) { - // TODO(hinsu): Convert variadic length attributes to derived attributes. - Operation::operand_range values = op.values(); - - int axis_idx = std::is_same() ? 0 : 1; - Value axis = *op.getODSOperands(axis_idx).begin(); - if (!HasRankAtMost(axis, 1)) { - return op.emitOpError( - "requires axis to be of scalar type (or vector type for older " - "versions)"); - } - - return VerifyTypesCompatibility(values, - /*mask_one_dim=*/true, op.getOperation()); -} - -void ConcatOp::getCanonicalizationPatterns(OwningRewritePatternList &results, - MLIRContext *context) { - results.insert(context); -} - -//===----------------------------------------------------------------------===// -// ConcatOffsetOp -//===----------------------------------------------------------------------===// - -static LogicalResult Verify(ConcatOffsetOp op) { - if (op.N() < 2) - return op.emitOpError() << "requires N to be at least 2, got " << op.N(); - - if (op.shape().size() != op.offset().size()) - return op.emitOpError() - << "requires sizes of shapes and offsets to be the same, got sizes " - << op.shape().size() << " and " << op.offset().size(); - - auto ranked_dim = op.concat_dim().getType().dyn_cast(); - if (ranked_dim && ranked_dim.getRank() != 0) - return op.emitOpError() - << "requires concat_dim to be a scalar, got tensor of rank " - << ranked_dim.getRank(); - - int64_t num_dims = -1; - for (auto shape_offset_idx : - llvm::enumerate(llvm::zip(op.shape(), op.offset()))) { - Value shape = std::get<0>(shape_offset_idx.value()); - Value offset = std::get<1>(shape_offset_idx.value()); - const size_t idx = shape_offset_idx.index(); - - if (failed(verifyCompatibleShape(shape.getType(), offset.getType()))) - return op.emitOpError() << "requires operand and result " << idx - << " to have compatible shapes"; - - auto ranked_shape = shape.getType().dyn_cast(); - if (!ranked_shape) continue; - - if (ranked_shape.getRank() != 1) - return op.emitOpError() << "requires shape tensor operand " << idx - << " to be of rank 1, got tensor of rank " - << ranked_shape.getRank(); - - if (!ranked_shape.hasStaticShape()) continue; - - int64_t ranked_shape_dim = ranked_shape.getDimSize(0); - if (num_dims == -1) - num_dims = ranked_shape_dim; - else if (ranked_shape_dim != num_dims) - return op.emitOpError() - << "requires shape tensor (rank 1) 
operand " << idx - << " to be of length " << num_dims - << ", got tensor (rank 1) of length " << ranked_shape_dim; - } - - return success(); -} - -LogicalResult ConcatOffsetOp::fold(ArrayRef operands, - SmallVectorImpl &results) { - // ConcatOffset must have its first operand be concat_dim and at least two - // shape tensors in variadic shapes operand. - if (operands.size() < 3) return failure(); - - // Check concat_dim is a scalar. - auto concat_dim_attr = operands[0].dyn_cast_or_null(); - if (!concat_dim_attr || concat_dim_attr.getType().getRank() != 0) - return failure(); - - llvm::SmallVector shapes; - shapes.reserve(operands.size() - 1); - for (Attribute shape : llvm::drop_begin(operands, 1)) - if (auto shape_attr = shape.dyn_cast_or_null()) - shapes.push_back(shape_attr); - else - return failure(); - - // Check all shapes are vectors of the same length. - if (shapes.front().getType().getRank() != 1) return success(); - const int64_t num_dims = shapes.front().getNumElements(); - for (DenseIntElementsAttr shape : llvm::drop_begin(shapes, 1)) - if (shape.getType().getRank() != 1 || shape.getNumElements() != num_dims) - return failure(); - - // Check concat_dim is within [-num_dims, num_dims). - int32_t concat_dim = (*concat_dim_attr.getValues().begin()); - if (concat_dim < 0) concat_dim += num_dims; - if (concat_dim >= num_dims || concat_dim < 0) return failure(); - - // Check all elements besides at concat_dim match across all shape tensors. - SmallVector shape0; - shape0.reserve(num_dims); - for (int32_t dim : shapes.front().getValues()) shape0.push_back(dim); - - for (DenseIntElementsAttr shape : llvm::drop_begin(shapes, 1)) { - for (auto dims_and_idx : llvm::enumerate(llvm::zip(shape0, shape))) { - if (dims_and_idx.index() == concat_dim) continue; - - if (std::get<0>(dims_and_idx.value()) != - std::get<1>(dims_and_idx.value()).getSExtValue()) - return failure(); - } - } - - // Compute an exclusive cumulative sum of elements at concat_dim. - results.reserve(shapes.size()); - SmallVector cumulative_sum(num_dims, 0); - RankedTensorType offset_type = - RankedTensorType::get({num_dims}, IntegerType::get(32, getContext())); - for (DenseIntElementsAttr shape : shapes) { - results.push_back(DenseIntElementsAttr::get(offset_type, cumulative_sum)); - cumulative_sum[concat_dim] += shape.getValue(concat_dim); - } - - return success(); -} - -//===----------------------------------------------------------------------===// -// ConjOp -//===----------------------------------------------------------------------===// - -void ConjOp::getCanonicalizationPatterns(OwningRewritePatternList &results, - MLIRContext *context) { - results.insert(context); -} - -//===----------------------------------------------------------------------===// -// ConstOp -//===----------------------------------------------------------------------===// - -OpFoldResult ConstOp::fold(ArrayRef operands) { - assert(operands.empty() && "constant has no operands"); - - // Return the held attribute value. - return value(); -} - -// Builds a constant op with the specified attribute `value`. The result -// op's type is deduced from `value`; if `value` is of scalar type, -// wraps it up with a tensor type of empty shape. -// TODO(jpienaar): This one differs from the autogenerated one as it takes an -// attribute but always creates an ElementsAttr internally. 
-void ConstOp::build(OpBuilder &builder, OperationState &result, - Attribute value) { - ShapedType type; - if (auto elem_attr = value.dyn_cast()) { - return ConstOp::build(builder, result, elem_attr); - } else if (value.isa()) { - // All TensorFlow types must be tensor types. In the build() method, - // we want to provide more flexibility by allowing attributes of scalar - // types. But we need to wrap it up with ElementsAttr to construct - // valid TensorFlow constants. - type = RankedTensorType::get(/*shape=*/{}, value.getType()); - return ConstOp::build(builder, result, DenseElementsAttr::get(type, value)); - } - // TODO(jpienaar): support other TensorFlow specific types. - llvm_unreachable("unsupported attribute type for building tf.Const"); -} - -void ConstOp::build(OpBuilder &builder, OperationState &result, Type type, - Attribute value) { - // Handle the case where the type and value are already tensors. - if (type.isa() && value.isa()) { - result.addTypes(type); - result.addAttribute("value", value); - return; - } - - // Otherwise, default to the attribute builder. - ConstOp::build(builder, result, value); - assert(type == result.types[0] && "type mismatch in construction"); -} - -LogicalResult ConstOp::inferReturnTypes( - MLIRContext *context, Optional location, ValueRange operands, - DictionaryAttr attributes, RegionRange regions, - SmallVectorImpl &inferredReturnTypes) { - auto value = attributes.get("value"); - if (!value) return emitOptionalError(location, "missing attribute 'value'"); - if (auto elem_attr = value.dyn_cast()) { - inferredReturnTypes.assign({elem_attr.getType()}); - return success(); - } - return emitOptionalError(location, - "attribute 'value' failed to satisfy constraint: " - "constant vector/tensor"); -} - -//===----------------------------------------------------------------------===// -// Conv2DOp and Conv3DOp -//===----------------------------------------------------------------------===// - -template -static LogicalResult VerifyConvOpAttributes(OpT op, int num_dims) { - if (!IsOfRankOrUnranked(op.getResult(), num_dims)) - return op.emitOpError() - << "requires result to be " << num_dims << "D tensor"; - - auto is_not_positive = [](Attribute val) { - return val.cast().getValue().getSExtValue() <= 0; - }; - - int64_t strides_size = op.strides().size(); - if (strides_size != num_dims) - return op.emitOpError() << "requires strides attribute length to be " - << num_dims << "; actual length " << strides_size; - if (llvm::any_of(op.strides().getValue(), is_not_positive)) - return op.emitOpError("requires positive strides"); - - int64_t dilations_size = op.strides().size(); - if (op.dilations().size() != num_dims) - return op.emitOpError() << "requires dilations attribute length to be " - << num_dims << "; actual length " << dilations_size; - if (llvm::any_of(op.dilations().getValue(), is_not_positive)) - return op.emitOpError("requires positive dilations"); - - return success(); -} - -// Verifies that, -// * Ranks of operands and result are valid -// * Number of input channels is divisible by the number of filter input -// channels -// * Length of explicit_paddings attribute is valid and has non negative -// elements -// * strides and dilations attributes have positive elements -template ::value>::type * = nullptr> -static LogicalResult Verify(OpT op) { - int num_spatial_dims = std::is_same() ? 
2 : 3; - int num_dims = 2 + num_spatial_dims; - - if (!IsOfRankOrUnranked(op.input(), num_dims) || - !IsOfRankOrUnranked(op.filter(), num_dims)) - return op.emitOpError() - << "requires operands to be " << num_dims << "D tensor"; - - // EXPLICIT padding mode and the associated attribute is limited to Conv2D. - // So, fetch attribute by string instead of the op.explicit_paddings() - // attribute getter. - if (op.padding() == "EXPLICIT") { - auto paddings = op.template getAttrOfType("explicit_paddings"); - if (!paddings) - return op.emitOpError() << "requires attribute 'explicit_paddings' with " - "'EXPLICIT' padding mode"; - - int64_t paddings_size = paddings.size(); - int64_t expected_size = 2 * num_dims; - - if (paddings_size != expected_size) - return op.emitOpError() - << "requires explicit_paddings attribute length to be " - << expected_size << "; actual length " << paddings_size; - - auto is_negative = [](Attribute val) { - return val.cast().getValue().getSExtValue() < 0; - }; - if (llvm::any_of(paddings.getValue(), is_negative)) - return op.emitOpError("requires non negative explicit paddings"); - } - - LogicalResult verify_result = VerifyConvOpAttributes(op, num_dims); - if (failed(verify_result)) { - return verify_result; - } - - int64_t input_channels = -1; - if (auto ty = op.input().getType().template dyn_cast()) { - std::string data_format = op.data_format().str(); - tensorflow::TensorFormat format; - auto is_valid = FormatFromString(data_format, &format); - DCHECK(is_valid) << data_format; - int idx = tensorflow::GetTensorFeatureDimIndex(num_dims, format); - input_channels = ty.getDimSize(idx); - } - - int64_t filter_channels = -1; - if (auto ty = op.filter().getType().template dyn_cast()) { - int idx = tensorflow::GetFilterTensorInputChannelsDimIndex( - num_dims, tensorflow::FORMAT_HWIO); - filter_channels = ty.getDimSize(idx); - } - - if (input_channels != -1 && filter_channels != -1 && - input_channels % filter_channels != 0) - return op.emitOpError() - << "requires the number of input channels to be divisible by the " - "number of filter input channels; found " - << input_channels << " and " << filter_channels << ", respectively"; - - return success(); -} - -LogicalResult Conv2DOp::UpdateDataFormat(StringRef data_format) { - auto perm = GetDataFormatPermutation(this->data_format(), data_format); - if (perm.empty()) return failure(); - - // Update data_format attribute and result types. - if (failed(::mlir::TF::UpdateDataFormat(data_format, this))) return failure(); - - // Update convolution attributes. - setAttr("dilations", ShuffleArrayAttr(dilations(), perm)); - setAttr("strides", ShuffleArrayAttr(strides(), perm)); - setAttr("explicit_paddings", ShuffleArrayAttr(explicit_paddings(), perm, 2)); - - return success(); -} - -StringRef Conv2DOp::GetOptimalLayout(const RuntimeDevices &devices) { - // Keep current data format if no GPUs are available or if explicit placement - // does not allow to use GPU for this operation. - if (!CanUseGpuDevice(devices) || !CanUseGpuDevice(getOperation())) - return data_format(); - - // Input must be a tensor. - auto input_ty = input().getType().dyn_cast(); - if (!input_ty) return data_format(); - - // For f16 data type on devices with Tensor Cores support NHWC data format - // is up to ~2x faster. 
- const bool is_f16 = input_ty.getElementType().isF16(); - if (is_f16 && CanUseTensorCores(devices)) return "NHWC"; - - // For f32/f16 data type decision depends on the filter size in spatial - // dimensions, for other data types we keep current data format. - if (!input_ty.getElementType().isF32() && !input_ty.getElementType().isF16()) - return data_format(); - - // Keep current data format if filter rank is unknown or not equal to 4. - auto filter_ty = filter().getType().dyn_cast(); - if (!filter_ty || filter_ty.getRank() != 4) return data_format(); - - const int64_t d0 = filter_ty.getDimSize(0); - const int64_t d1 = filter_ty.getDimSize(1); - - auto all_ones = [](ArrayAttr arr) -> bool { - return llvm::all_of(arr, [](Attribute attr) -> bool { - return attr.cast().getInt() == 1; - }); - }; - - // Convolutions with 1x1 filter and with strides and dilations all ones, can - // be computed as a GEMM in NHWC data format, and can be up to ~2x times - // faster than convolution in NCHW. - const bool one_by_one = d0 == 1 && d1 == 1; - const bool trivial_strides = all_ones(strides()); - const bool trivial_dilations = all_ones(dilations()); - - // TODO(ezhulenev): This might lead to excessive transposes in the final IR, - // if the ratio of 1x1 convolutions to regular convolutions is close to 1:1. - // Also FusedBatchNorm in training mode prefers NCHW data format. Check if all - // users can efficiently use NHWC data format? - if (one_by_one && trivial_strides && trivial_dilations) { - return "NHWC"; - } - - // If filter spatial dimensions are unknown or not 1x1 we prefer NCHW, because - // it's the fastest option on NVIDIA GPUs with cuDNN library support. - return "NCHW"; -} - -//===----------------------------------------------------------------------===// -// Conv2dBackpropFilterOp -//===----------------------------------------------------------------------===// - -LogicalResult Conv2DBackpropFilterOp::UpdateDataFormat(StringRef data_format) { - StringRef src_data_format = this->data_format(); - - auto perm = GetDataFormatPermutation(src_data_format, data_format); - if (perm.empty()) return failure(); - - // Update data_format attribute and result types. - if (failed(::mlir::TF::UpdateDataFormat(data_format, this))) return failure(); - - // Update convolution attributes. - setAttr("dilations", ShuffleArrayAttr(dilations(), perm)); - setAttr("strides", ShuffleArrayAttr(strides(), perm)); - setAttr("explicit_paddings", ShuffleArrayAttr(explicit_paddings(), perm, 2)); - - // Permute filter sizes operand. - OpBuilder builder(getOperation()); - auto filter_sizes_permuted = builder.create( - getLoc(), filter_sizes(), StringAttr::get(src_data_format, getContext()), - StringAttr::get(data_format, getContext())); - setOperand(1, filter_sizes_permuted); - - return success(); -} - -StringRef Conv2DBackpropFilterOp::GetOptimalLayout( - const RuntimeDevices &devices) { - // Keep current data format if no GPUs are available or if explicit placement - // does not allow to use GPU for this operation. - if (!CanUseGpuDevice(devices) || !CanUseGpuDevice(getOperation())) - return data_format(); - - // Input must be a tensor. - auto input_ty = input().getType().dyn_cast(); - if (!input_ty) return data_format(); - - // For f16 data type on devices with Tensor Cores support NHWC data format - // is up to ~2x faster. - const bool is_f16 = input_ty.getElementType().isF16(); - if (is_f16 && CanUseTensorCores(devices)) return "NHWC"; - - // Otherwise always use "NCHW". 
- return "NCHW"; -} - -//===----------------------------------------------------------------------===// -// Conv2DBackpropInputOp -//===----------------------------------------------------------------------===// - -static LogicalResult Verify(Conv2DBackpropInputOp op) { - int num_spatial_dims = 2; - int num_dims = 2 + num_spatial_dims; - - if (!IsOfRankOrUnranked(op.out_backprop(), num_dims) || - !IsOfRankOrUnranked(op.filter(), num_dims)) - return op.emitOpError() - << "requires operands to be " << num_dims << "D tensor"; - - LogicalResult verify_result = VerifyConvOpAttributes(op, num_dims); - if (failed(verify_result)) { - return verify_result; - } - - return success(); -} - -LogicalResult Conv2DBackpropInputOp::UpdateDataFormat(StringRef data_format) { - StringRef src_data_format = this->data_format(); - - auto perm = GetDataFormatPermutation(src_data_format, data_format); - if (perm.empty()) return failure(); - - // Update data_format attribute and result types. - if (failed(::mlir::TF::UpdateDataFormat(data_format, this))) return failure(); - - // Update convolution attributes. - setAttr("dilations", ShuffleArrayAttr(dilations(), perm)); - setAttr("strides", ShuffleArrayAttr(strides(), perm)); - setAttr("explicit_paddings", ShuffleArrayAttr(explicit_paddings(), perm, 2)); - - // Permute input sizes operand. - OpBuilder builder(getOperation()); - auto input_sizes_permuted = builder.create( - getLoc(), input_sizes(), StringAttr::get(src_data_format, getContext()), - StringAttr::get(data_format, getContext())); - setOperand(0, input_sizes_permuted); - - return success(); -} - -StringRef Conv2DBackpropInputOp::GetOptimalLayout( - const RuntimeDevices &devices) { - // Keep current data format if no GPUs are available or if explicit placement - // does not allow to use GPU for this operation. - if (!CanUseGpuDevice(devices) || !CanUseGpuDevice(getOperation())) - return data_format(); - - // Filter must be a tensor. - auto filter_ty = filter().getType().dyn_cast(); - if (!filter_ty) return data_format(); - - // For f16 data type on devices with Tensor Cores support NHWC data format - // is up to ~2x faster. - const bool is_f16 = filter_ty.getElementType().isF16(); - if (is_f16 && CanUseTensorCores(devices)) return "NHWC"; - - // Otherwise always use "NCHW". 
- return "NCHW"; -} - -//===----------------------------------------------------------------------===// -// DataFormatVecPermuteOp -//===----------------------------------------------------------------------===// - -static LogicalResult Verify(DataFormatVecPermuteOp op) { - auto input_ty = op.x().getType().dyn_cast(); - if (!input_ty) return success(); - - int rank = input_ty.getRank(); - if (rank != 1 && rank != 2) - return op.emitOpError("requires input of rank 1 or 2"); - - if (rank == 1) { - int64_t dim0 = input_ty.getDimSize(0); - if (dim0 != ShapedType::kDynamicSize && dim0 != 4 && dim0 != 2) - return op.emitOpError("requires 1D input of size 4 or size 2"); - } - - if (rank == 2) { - int64_t dim0 = input_ty.getDimSize(0); - if (dim0 != ShapedType::kDynamicSize && dim0 != 4) - return op.emitOpError( - "requires first dimensions of 2D input to be of size 4"); - - int64_t dim1 = input_ty.getDimSize(1); - if (dim1 != ShapedType::kDynamicSize && dim1 != 2) - return op.emitOpError( - "requires second dimensions of 2D input to be of size 2"); - } - - return success(); -} - -//===----------------------------------------------------------------------===// -// DivOp -//===----------------------------------------------------------------------===// - -void DivOp::getCanonicalizationPatterns(OwningRewritePatternList &results, - MLIRContext *context) { - results.insert(context); -} - -OpFoldResult DivOp::fold(ArrayRef operands) { - return IdentityArithmeticOpFolder(*this, operands); -} - -//===----------------------------------------------------------------------===// -// DynamicStitchOp -//===----------------------------------------------------------------------===// - -static LogicalResult Verify(DynamicStitchOp op) { - if (op.N() < 1) return op.emitOpError("requires attribute N with value >= 1"); - - if (RankedTensorType out_ty = op.getType().dyn_cast()) { - if (out_ty.getRank() == 0) { - return op.emitOpError("requires non scalar output"); - } - } - - llvm::SmallDenseSet index_values; - bool all_indices_const = true; - int32_t max_index = -1; - llvm::Optional> inferred_item_shape; - for (auto it : llvm::zip(op.indices(), op.data())) { - Value index = std::get<0>(it); - - DenseIntElementsAttr index_attr; - if (matchPattern(index, m_Constant(&index_attr))) { - for (int32_t index : index_attr.getValues()) { - if (index < 0) - return op.emitOpError() - << "requires non-negative index values; found " << index; - max_index = std::max(index, max_index); - index_values.insert(index); - } - } else { - all_indices_const = false; - } - - Value data = std::get<1>(it); - RankedTensorType index_ty = index.getType().dyn_cast(); - RankedTensorType data_ty = data.getType().dyn_cast(); - if (!index_ty || !data_ty) continue; - - int64_t index_rank = index_ty.getRank(); - ArrayRef data_shape = data_ty.getShape(); - ArrayRef index_shape = index_ty.getShape(); - if (failed(mlir::verifyCompatibleShape(index_shape, - data_shape.take_front(index_rank)))) - return op.emitOpError() << "requires shape of data with type " << data_ty - << " to have prefix matching with shape of the " - "corresponding index type " - << index_ty; - - ArrayRef item_shape = data_shape.drop_front(index_rank); - if (!inferred_item_shape) { - inferred_item_shape = llvm::to_vector<4>(item_shape); - continue; - } - - if (failed(mlir::verifyCompatibleShape(item_shape, *inferred_item_shape))) - return op.emitOpError() << "has inconsistent shaped data and index " - "pairs; inferred item shapes [" - << llvm::makeArrayRef(*inferred_item_shape) - << "] 
and [" << item_shape << "] don't match"; - for (int i = 0, e = item_shape.size(); i < e; ++i) { - int64_t &inferred_dim = (*inferred_item_shape)[i]; - int64_t dim = item_shape[i]; - if (ShapedType::isDynamic(inferred_dim)) inferred_dim = dim; - } - } - - // If all indices are constants, then verify that they cover all indices in - // the range [0, max_index] and the output type is legal. - if (all_indices_const) { - for (int32_t i = 0; i <= max_index; i++) { - if (!index_values.count(i)) - return op.emitOpError() << "missing index " << i; - } - - if (inferred_item_shape) { - SmallVector expected_shape; - expected_shape.push_back(max_index + 1); - expected_shape.append(inferred_item_shape->begin(), - inferred_item_shape->end()); - - auto out_ty = op.getType().cast(); - auto expected_out_ty = - RankedTensorType::get(expected_shape, out_ty.getElementType()); - - if (!AreCastCompatible({out_ty, expected_out_ty})) { - return op.emitOpError() << "has invalid output type; should be " - "compatible with inferred type " - << expected_out_ty; - } - } - } - - return success(); -} - -//===----------------------------------------------------------------------===// -// EinsumOp -//===----------------------------------------------------------------------===// - -// Verifies that, -// * Arity of the op is at most two. -// -// TODO(hinsu): Verify einsum equation attribute. -static LogicalResult Verify(EinsumOp op) { - if (op.N() > 2) { - return op.emitOpError("supports at most two operands"); - } - return success(); -} - -//===----------------------------------------------------------------------===// -// EmptyOp -//===----------------------------------------------------------------------===// - -OpFoldResult EmptyOp::fold(ArrayRef operands) { - assert(operands.size() == 1 && "empty op has one operand"); - - Attribute attr = operands.front(); - if (!attr) return {}; - - auto int_attr = attr.cast(); - SmallVector out_shape; - for (const auto val : int_attr.getValues()) { - out_shape.push_back(val); - } - - auto type = getResult().getType().cast(); - auto etype = type.getElementType(); - - // We can not fold if the result is not static. - if (!type.hasStaticShape()) return {}; - - if (auto float_type = etype.dyn_cast()) { - auto out_type = RankedTensorType::get(out_shape, float_type); - return DenseElementsAttr::get(out_type, - {APFloat(float_type.getFloatSemantics())}); - } - - if (auto int_type = etype.dyn_cast()) { - auto out_type = RankedTensorType::get(out_shape, etype); - APInt val(int_type.getWidth(), 0, int_type.getSignedness()); - return DenseElementsAttr::get(out_type, val); - } - - return {}; -} - -//===----------------------------------------------------------------------===// -// EmptyTensorListOp -//===----------------------------------------------------------------------===// - -static LogicalResult Verify(EmptyTensorListOp op) { - if (!IsOfRankOrUnranked(op.element_shape(), 0) && - !IsOfRankOrUnranked(op.element_shape(), 1)) { - return op.emitOpError("requires element_shape operand to be 0D/1D tensor"); - } - - if (!IsOfRankOrUnranked(op.max_num_elements(), 0)) { - return op.emitOpError("requires max_num_elements operand to be 0D tensor"); - } - return success(); -} - -//===----------------------------------------------------------------------===// -// EqualOp -//===----------------------------------------------------------------------===// - -static LogicalResult Verify(EqualOp op) { - // If we allow inputs to have incompatible type, then nothing to do. 
- if (!op.incompatible_shape_error()) return success(); - - // Otherwise, check inputs are broadcastable. - return mlir::OpTrait::impl::verifyCompatibleOperandBroadcast( - op.getOperation()); -} - -void EqualOp::build(OpBuilder &builder, OperationState &result, Value x, - Value y, BoolAttr incompatible_shape_error) { - auto result_type = DeduceEqualCmpOpType(&builder, result.location, x, y, - incompatible_shape_error); - return build(builder, result, result_type, x, y, incompatible_shape_error); -} - -//===----------------------------------------------------------------------===// -// ExpandDimsOp -//===----------------------------------------------------------------------===// - -Type InferExpandDimsOpType(Value input, Value dim) { - Type element_ty = input.getType().cast().getElementType(); - auto unranked_ty = UnrankedTensorType::get(element_ty); - - auto input_ty = input.getType().dyn_cast(); - if (!input_ty) return unranked_ty; - - DenseIntElementsAttr dim_attr; - if (!matchPattern(dim, m_Constant(&dim_attr)) || - dim_attr.getNumElements() != 1) - return unranked_ty; - int64_t dim_val = (*dim_attr.begin()).getSExtValue(); - int64_t input_rank = input_ty.getRank(); - - if (dim_val < -input_rank - 1 || dim_val > input_rank + 1) return unranked_ty; - if (dim_val < 0) dim_val += input_rank + 1; - - SmallVector shape = llvm::to_vector<4>(input_ty.getShape()); - shape.insert(shape.begin() + dim_val, 1); - return RankedTensorType::get(shape, element_ty); -} - -void ExpandDimsOp::build(OpBuilder &builder, OperationState &result, - Value input, Value dim) { - return build(builder, result, InferExpandDimsOpType(input, dim), input, dim); -} - -//===----------------------------------------------------------------------===// -// FakeQuantWithMinMaxArgsOp -//===----------------------------------------------------------------------===// -static LogicalResult Verify(FakeQuantWithMinMaxArgsOp op) { - // TODO(fengliuai): moving the following to an utility method. - const llvm::fltSemantics &semantics = op.min().getSemantics(); - float rmin, rmax; - if (&semantics == &APFloat::IEEEsingle()) { - rmin = op.min().convertToFloat(); - rmax = op.max().convertToFloat(); - } else { - rmin = op.min().convertToDouble(); - rmax = op.max().convertToDouble(); - } - // Range boundaries must be valid. 
- if (rmin >= rmax) { - return op.emitOpError("range is invalid: [" + Twine(std::to_string(rmin)) + - "," + Twine(std::to_string(rmax)) + "]"); - } - int64_t num_bits = op.num_bits().getSExtValue(); - if (num_bits < 2 || num_bits > 16) { - return op.emitOpError( - "requires num_bits to be between 2 and 16, inclusive"); - } - return success(); -} - -//===----------------------------------------------------------------------===// -// FakeQuantWithMinMaxVarsOp -//===----------------------------------------------------------------------===// -static LogicalResult Verify(FakeQuantWithMinMaxVarsOp op) { - auto min = GetRankedTensorTypeForOperand(op.min()); - if (min && !IsOfRankedFloatTensorType(min, 0)) - return op.emitOpError("requires min to be a 0d float tensor"); - - auto max = GetRankedTensorTypeForOperand(op.max()); - if (max && !IsOfRankedFloatTensorType(max, 0)) - return op.emitOpError("requires max to be a 0d float tensor"); - - int64_t num_bits = op.num_bits().getSExtValue(); - if (num_bits < 2 || num_bits > 16) { - return op.emitOpError( - "requires num_bits to be between 2 and 16, inclusive"); - } - return success(); -} - -//===----------------------------------------------------------------------===// -// FakeQuantWithMinMaxVarsPerChannelOp -//===----------------------------------------------------------------------===// -static LogicalResult Verify(FakeQuantWithMinMaxVarsPerChannelOp op) { - auto min = GetRankedTensorTypeForOperand(op.min()); - if (min && !IsOfRankedFloatTensorType(min, 1)) - return op.emitOpError("requires min to be a 1d float tensor"); - - auto max = GetRankedTensorTypeForOperand(op.max()); - if (max && !IsOfRankedFloatTensorType(max, 1)) - return op.emitOpError("requires max to be a 1d float tensor"); - - Value inputs = op.inputs(); - if (!HasRankAtLeast(inputs, 1)) - return op.emitError("requires inputs to be at least 1d float tensor"); - - int64_t num_bits = op.num_bits().getSExtValue(); - if (num_bits < 2 || num_bits > 16) { - return op.emitOpError( - "requires num_bits to be between 2 and 16, inclusive"); - } - - auto inputs_type = inputs.getType().dyn_cast(); - if (!inputs_type) return success(); - int depth = inputs_type.getDimSize(inputs_type.getRank() - 1); - if ((min && min.getDimSize(0) != depth) || - (max && max.getDimSize(0) != depth)) { - return op.emitOpError( - "requires min and max to have same size as last dimension of inputs"); - } - - return success(); -} - -//===----------------------------------------------------------------------===// -// FillOp -//===----------------------------------------------------------------------===// - -static LogicalResult Verify(FillOp op) { - if (!IsOfRankOrUnranked(op.dims(), 1)) - return op.emitOpError() << "requires dims to be a 1D tensor"; - if (!IsOfRankOrUnranked(op.value(), 0)) - return op.emitOpError() << "requires value to be a scalar"; - - return success(); -} - -static ShapedType InferFillOpType(Value dims, Value value) { - Type etype = value.getType().cast().getElementType(); - - DenseIntElementsAttr dims_attr; - if (!matchPattern(dims, m_Constant(&dims_attr))) { - return UnrankedTensorType::get(etype); - } - - llvm::SmallVector shape; - shape.reserve(dims_attr.getNumElements()); - for (const APInt dim : dims_attr.getValues()) { - shape.push_back(dim.getSExtValue()); - } - return RankedTensorType::get(shape, etype); -} - -void FillOp::build(OpBuilder &builder, OperationState &result, Value dims, - Value value) { - FillOp::build(builder, result, InferFillOpType(dims, value), dims, value); -} - 
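The Fill helpers above (InferFillOpType and the fold that follows) hinge on one idea: if the dims operand is a compile-time constant, the result type is fully determined; otherwise it stays unranked. Below is a minimal standalone sketch of that decision only, deliberately free of the MLIR types used in this file; SimpleShape and InferFillShape are hypothetical names used for illustration, not part of the diff.

#include <cassert>
#include <cstdint>
#include <optional>
#include <vector>

// An empty optional stands in for "unranked" (shape unknown); a vector of
// extents stands in for a ranked shape.
using SimpleShape = std::optional<std::vector<int64_t>>;

// If the dims operand is known to be constant, the fill result is ranked with
// exactly those extents; otherwise nothing can be said about its shape.
SimpleShape InferFillShape(const std::optional<std::vector<int64_t>>& const_dims) {
  if (!const_dims) return std::nullopt;  // dims not constant -> unranked result
  return *const_dims;                    // constant dims -> ranked result
}

int main() {
  assert(!InferFillShape(std::nullopt).has_value());
  SimpleShape s = InferFillShape(std::vector<int64_t>{2, 3});
  assert(s.has_value() && s->size() == 2 && (*s)[0] == 2 && (*s)[1] == 3);
  return 0;
}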
-OpFoldResult FillOp::fold(ArrayRef operands) { - assert(operands.size() == 2 && "fill op has two operand"); - - auto type = getType().cast(); - // DenseElementsAttr that is used in this folder only supports int and float - // types. - // TODO(hinsu): Handle complex types once there is a attribute kind for - // complex. - if (!type.getElementType().isIntOrFloat()) return {}; - - auto value = operands[1].dyn_cast_or_null(); - if (!value) return {}; - - if (type.hasStaticShape()) - return DenseElementsAttr::get(type, value.getValue({})); - - auto dims = operands[0].dyn_cast_or_null(); - if (!dims) return {}; - - llvm::SmallVector shape; - shape.reserve(dims.getNumElements()); - for (const APInt dim : dims.getValues()) { - shape.push_back(dim.getSExtValue()); - } - type = RankedTensorType::get(shape, type.getElementType()); - - return DenseElementsAttr::get(type, value.getValue({})); -} - -//===----------------------------------------------------------------------===// -// FusedBatchNormGradOp -//===----------------------------------------------------------------------===// - -// TODO(b/150954845): Add benchmarks to verify that layout preference didn't -// change in the latest GPU generations. - -LogicalResult FusedBatchNormGradV3Op::UpdateDataFormat(StringRef data_format) { - return ::mlir::TF::UpdateDataFormat(data_format, this); -} - -StringRef FusedBatchNormGradV3Op::GetOptimalLayout( - const RuntimeDevices &devices) { - // Keep current data format if no GPUs are available or if explicit placement - // does not allow to use GPU for this operation. - if (!CanUseGpuDevice(devices) || !CanUseGpuDevice(getOperation())) - return data_format(); - - // For f16 data type on devices with Tensor Cores support NHWC data format - // is up to ~2x faster. - auto x_ty = x().getType().cast(); - const bool is_f16 = x_ty.getElementType().isF16(); - if (is_f16 && CanUseTensorCores(devices)) return "NHWC"; - - // For all other data types prefer NCHW. - return "NCHW"; -} - -//===----------------------------------------------------------------------===// -// FusedBatchNormOp -//===----------------------------------------------------------------------===// - -static LogicalResult Verify(FusedBatchNormOp op) { - auto x = GetRankedTensorTypeForOperand(op.x()); - if (x && !IsOfRankedFloatTensorType(x, 4)) - return op.emitOpError("requires x to be a 4D float tensor"); - - auto scale = GetRankedTensorTypeForOperand(op.scale()); - if (scale && !IsOfRankedFloatTensorType(scale, 1)) - return op.emitOpError("requires scale to be a 1D float tensor"); - - auto offset = GetRankedTensorTypeForOperand(op.offset()); - if (offset && !IsOfRankedFloatTensorType(offset, 1)) - return op.emitOpError("requires offset to be a 1D float tensor"); - - auto mean = GetRankedTensorTypeForOperand(op.mean()); - if (mean && !IsOfRankedFloatTensorType(mean, 1)) - return op.emitOpError("requires mean to be a 1D float tensor"); - - auto variance = GetRankedTensorTypeForOperand(op.variance()); - if (variance && !IsOfRankedFloatTensorType(variance, 1)) - return op.emitOpError("requires variance to be a 1D float tensor"); - - // TODO(antiagainst): check attributes - - return success(); -} - -LogicalResult FusedBatchNormV3Op::FoldOperandsPermutation( - ArrayRef permutation) { - // FusedBatchNorm in training mode is a layout sentitive operation, and should - // have already assigned an optimal data format. 
- if (is_training()) return failure(); - - return ::mlir::TF::FoldOperandsPermutation(permutation, this); -} - -LogicalResult FusedBatchNormV3Op::UpdateDataFormat(StringRef data_format) { - return ::mlir::TF::UpdateDataFormat(data_format, this); -} - -StringRef FusedBatchNormV3Op::GetOptimalLayout(const RuntimeDevices &devices) { - // In inference mode FusedBatchNorm is not sensitive to data layout. - if (!is_training()) return data_format(); - - // Keep current data format if no GPUs are available or if explicit placement - // does not allow to use GPU for this operation. - if (!CanUseGpuDevice(devices) || !CanUseGpuDevice(getOperation())) - return data_format(); - - // For f16 data type on devices with Tensor Cores support NHWC data format - // is up to ~2x faster. - auto x_ty = x().getType().cast(); - const bool is_f16 = x_ty.getElementType().isF16(); - if (is_f16 && CanUseTensorCores(devices)) return "NHWC"; - - // For all other data types prefer NCHW. - return "NCHW"; -} - -//===----------------------------------------------------------------------===// -// GatherV2Op -//===----------------------------------------------------------------------===// - -static LogicalResult Verify(GatherV2Op op) { - int64_t batch_dims = op.batch_dims().getSExtValue(); - if (auto ty = op.indices().getType().dyn_cast()) { - int64_t rank = ty.getRank(); - if (batch_dims > rank || batch_dims < -rank) - return op.emitOpError() - << "batch_dims (" << batch_dims << ") must be in range [" << -rank - << ", " << rank + 1 << ")"; - if (batch_dims < 0) batch_dims += rank; - } - - if (!HasRankAtMost(op.axis(), 1)) - return op.emitOpError("requires axis to have rank at most 1"); - - DenseIntElementsAttr axis_attr; - if (matchPattern(op.axis(), m_Constant(&axis_attr))) { - int64_t axis = (*axis_attr.begin()).getSExtValue(); - if (auto ty = op.params().getType().dyn_cast()) { - int64_t rank = ty.getRank(); - if (axis >= rank || axis < -rank) - return op.emitOpError() << "axis (" << axis << ") must be in range [" - << -rank << ", " << rank << ")"; - if (axis < 0) axis += rank; - } - - if (batch_dims >= 0 && axis >= 0 && axis < batch_dims) { - return op.emitOpError() << "requires axis (" << axis - << ") to be greater than or equal to batch_dims (" - << batch_dims << ")"; - } - } - return success(); -} - -//===----------------------------------------------------------------------===// -// IfOp -//===----------------------------------------------------------------------===// - -static LogicalResult Verify(IfOp op) { - auto module = op.getParentOfType(); - auto then_fn = module.lookupSymbol(op.then_branch()); - if (!then_fn) - return op.emitOpError("then_branch refers to an undefined function : ") - << op.then_branch(); - auto else_fn = module.lookupSymbol(op.else_branch()); - if (!else_fn) - return op.emitOpError("else_branch refers to an undefined function : ") - << op.else_branch(); - auto then_fn_type = then_fn.getType(); - auto else_fn_type = else_fn.getType(); - - // Non-conditional operands starting with the second operand are passed to - // branches and should be pair-wise compatible with branches' inputs. 
-  unsigned expected_num_inputs = op.getNumOperands() - 1;
-  if (then_fn_type.getNumInputs() != expected_num_inputs ||
-      else_fn_type.getNumInputs() != expected_num_inputs)
-    return op.emitError("branches should have " + Twine(expected_num_inputs) +
-                        " inputs");
-
-  for (unsigned i = 0; i < expected_num_inputs; ++i) {
-    auto operand_type = op.getOperand(i + 1).getType().cast<TensorType>();
-    auto then_input_type = then_fn_type.getInput(i).cast<TensorType>();
-    if (!AreCastCompatible({operand_type, then_input_type}))
-      return op.emitError(
-          llvm::formatv("then branch input type {0} is incompatible with "
-                        "operand type {1} at index {2}",
-                        then_input_type, operand_type, i));
-
-    auto else_input_type = else_fn_type.getInput(i).cast<TensorType>();
-    if (!AreCastCompatible({operand_type, else_input_type}))
-      return op.emitError(
-          llvm::formatv("else branch input type {0} is incompatible with "
-                        "operand type {1} at index {2}",
-                        else_input_type, operand_type, i));
-
-    // If branches have incompatible input types that means that no tensor can
-    // serve as input to both the functions. Hence, the op is invalid.
-    if (!AreCastCompatible({then_input_type, else_input_type}))
-      return op.emitError(llvm::formatv(
-          "branches inputs have incompatible types {0} and {1} at index {2}",
-          then_input_type, else_input_type, i));
-  }
-
-  // Branches' results should be pair-wise compatible with the op results.
-  unsigned expected_num_results = op.getNumResults();
-  if (then_fn_type.getNumResults() != expected_num_results ||
-      else_fn_type.getNumResults() != expected_num_results)
-    return op.emitError("branches should have " + Twine(expected_num_results) +
-                        " results");
-
-  for (unsigned i = 0; i < expected_num_results; ++i) {
-    auto result_type = op.getResult(i).getType().cast<TensorType>();
-    auto then_result_type = then_fn_type.getResult(i).cast<TensorType>();
-    if (!AreCastCompatible({then_result_type, result_type}))
-      return op.emitError(
-          llvm::formatv("then branch result type {0} is incompatible with op "
-                        "result type {1} at index {2}",
-                        then_result_type, result_type, i));
-
-    auto else_result_type = else_fn_type.getResult(i).cast<TensorType>();
-    if (!AreCastCompatible({else_result_type, result_type}))
-      return op.emitError(
-          llvm::formatv("else branch result type {0} is incompatible with op "
-                        "result type {1} at index {2}",
-                        else_result_type, result_type, i));
-  }
-  return success();
-}
-
-//===----------------------------------------------------------------------===//
-// IfRegionOp
-//===----------------------------------------------------------------------===//
-
-LogicalResult VerifyRegionResults(Operation *op, Region &region,
-                                  StringRef region_name) {
-  auto op_name = op->getName().getStringRef();
-  // verify that op outputs match yield inputs
-  YieldOp yield = cast<YieldOp>(region.front().getTerminator());
-  unsigned expected_num_results = op->getNumResults();
-  if (yield.getNumOperands() != expected_num_results)
-    return op->emitOpError()
-           << region_name + " should have same number (" << expected_num_results
-           << ") of results as " << op_name << " but has "
-           << yield.getNumOperands() << " results";
-
-  for (int idx : llvm::seq<int>(0, expected_num_results)) {
-    auto op_result_type = op->getResult(idx).getType().cast<TensorType>();
-    auto region_result_type =
-        yield.getOperand(idx).getType().cast<TensorType>();
-    if (!AreCastCompatible({region_result_type, op_result_type}))
-      return op->emitError(llvm::formatv(
-          "{0} result type {1} is incompatible with {2} "
-          "result type {3} at index {4}",
-          region_name, region_result_type, op_name, op_result_type, idx));
-  }
-  return success();
-}
-
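The branch-signature checks above all reduce to cast compatibility between shaped types: element types must match and every statically known dimension must agree, with a dynamic dimension compatible with anything. A minimal standalone sketch of that rule under those assumptions, using a hypothetical ShapeSig struct in place of MLIR's TensorType; it is an illustration, not the AreCastCompatible implementation.

#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

// Hypothetical stand-in for a tensor type: element type name plus
// per-dimension sizes, where -1 marks a dynamic dimension.
struct ShapeSig {
  std::string element_type;
  std::vector<int64_t> dims;
  bool ranked = true;
};

// Two signatures are compatible when element types match and each static
// dimension agrees; unranked or dynamic entries are compatible with anything.
static bool CompatibleForIfBranch(const ShapeSig &a, const ShapeSig &b) {
  if (a.element_type != b.element_type) return false;
  if (!a.ranked || !b.ranked) return true;
  if (a.dims.size() != b.dims.size()) return false;
  for (size_t i = 0; i < a.dims.size(); ++i) {
    if (a.dims[i] == -1 || b.dims[i] == -1) continue;
    if (a.dims[i] != b.dims[i]) return false;
  }
  return true;
}

int main() {
  ShapeSig operand{"f32", {8, -1}};
  ShapeSig then_input{"f32", {8, 16}};
  ShapeSig else_input{"f32", {4, 16}};
  assert(CompatibleForIfBranch(operand, then_input));     // 8x? vs 8x16: ok
  assert(!CompatibleForIfBranch(then_input, else_input)); // 8x16 vs 4x16: error
  return 0;
}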
-static LogicalResult Verify(IfRegionOp op) { - if (failed(VerifyRegionResults(op, op.then_branch(), "then"))) - return failure(); - if (failed(VerifyRegionResults(op, op.else_branch(), "else"))) - return failure(); - return success(); -} - -//===----------------------------------------------------------------------===// -// InvertOp -//===----------------------------------------------------------------------===// - -void InvertOp::getCanonicalizationPatterns(OwningRewritePatternList &results, - MLIRContext *context) { - results.insert(context); -} - -//===----------------------------------------------------------------------===// -// InvertPermutationOp -//===----------------------------------------------------------------------===// - -// Verifies that the input is 1D. -static LogicalResult Verify(InvertPermutationOp op) { - auto x_type = op.x().getType().cast(); - if (!x_type.hasRank()) return success(); - if (x_type.getShape().size() != 1) - return op.emitOpError() << "requires input x to be 1-dimensional"; - - return success(); -} - -//===----------------------------------------------------------------------===// -// LeakyReluOp -//===----------------------------------------------------------------------===// - -OpFoldResult LeakyReluOp::fold(ArrayRef operands) { - assert(operands.size() == 1 && "leaky relu has one operand"); - - // leaky_relu(x, alpha: 1) -> x - if (alpha().convertToFloat() == 1.0f) return getOperand(); - - auto calculate = [&](FloatAttr arg) { - APFloat val = arg.getValue(); - if (val.isNegative()) val = alpha() * val; - return FloatAttr::get(arg.getType(), val); - }; - - if (auto arg = operands[0].dyn_cast_or_null()) { - return calculate(arg); - } else if (auto arg = operands[0].dyn_cast_or_null()) { - if (auto elementAttr = arg.getSplatValue().dyn_cast()) - return DenseElementsAttr::get(arg.getType(), calculate(elementAttr)); - } - return {}; -} - -//===----------------------------------------------------------------------===// -// LogOp -//===----------------------------------------------------------------------===// - -void LogOp::getCanonicalizationPatterns(OwningRewritePatternList &results, - MLIRContext *context) { - results.insert(context); -} - -//===----------------------------------------------------------------------===// -// ReadVariableOp -//===----------------------------------------------------------------------===// - -void ReadVariableOp::getCanonicalizationPatterns( - OwningRewritePatternList &results, MLIRContext *context) { - results.insert(context); -} - -//===----------------------------------------------------------------------===// -// VarIsInitializedOp -//===----------------------------------------------------------------------===// - -namespace { - -/// Erase VarIsInitializedOp operations with no uses. This op has side effect on -/// resources (read-only), but can still be deleted if it has zero uses. -struct EraseDeadVarIsInitializedOp - : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(VarIsInitializedOp op, - PatternRewriter &rewriter) const override { - if (!op.use_empty()) return failure(); - rewriter.eraseOp(op); - return success(); - } -}; -} // end anonymous namespace. 
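The LeakyRelu folder above has two fast paths: alpha == 1 folds to the operand itself, and a splat constant operand folds element-wise at compile time. A standalone sketch of the same arithmetic on plain vectors; FoldLeakyRelu is a hypothetical helper, not the MLIR folder.

#include <cassert>
#include <vector>

// leaky_relu(x, alpha) = x for x >= 0, alpha * x otherwise.
static float LeakyRelu(float x, float alpha) {
  return x < 0.0f ? alpha * x : x;
}

// Folds a constant input entirely ahead of time; a plain-vector stand-in for
// folding a dense constant attribute.
static std::vector<float> FoldLeakyRelu(const std::vector<float> &values,
                                        float alpha) {
  std::vector<float> out;
  out.reserve(values.size());
  for (float v : values) out.push_back(LeakyRelu(v, alpha));
  return out;
}

int main() {
  // alpha == 1: the op is the identity, so folding returns the input as-is.
  std::vector<float> x = {-2.0f, 0.5f};
  assert(FoldLeakyRelu(x, 1.0f) == x);

  // Splat-like constant: every element gets the same treatment.
  std::vector<float> folded = FoldLeakyRelu({-4.0f, -4.0f}, 0.25f);
  assert(folded[0] == -1.0f && folded[1] == -1.0f);
  return 0;
}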
- -void VarIsInitializedOp::getCanonicalizationPatterns( - OwningRewritePatternList &patterns, MLIRContext *context) { - patterns.insert(context); -} - -//===----------------------------------------------------------------------===// -// LogicalNotOp -//===----------------------------------------------------------------------===// - -void LogicalNotOp::getCanonicalizationPatterns( - OwningRewritePatternList &results, MLIRContext *context) { - results.insert(context); -} - -//===----------------------------------------------------------------------===// -// MatrixBandPartOp -//===----------------------------------------------------------------------===// - -static LogicalResult Verify(MatrixBandPartOp op) { - if (!HasRankAtLeast(op.input(), 2)) { - return op.emitOpError() - << "requires `input` to have rank of at least 2, but found " - << op.input().getType(); - } - if (!IsOfRankOrUnranked(op.num_lower(), 0)) { - return op.emitOpError() - << "requires `num_lower` to have 0 dimensions, but found " - << op.num_lower().getType(); - } - if (!IsOfRankOrUnranked(op.num_upper(), 0)) { - return op.emitOpError() - << "requires `num_upper` to have 0 dimensions, but found " - << op.num_upper().getType(); - } - return success(); -} - -//===----------------------------------------------------------------------===// -// MaxOp -//===----------------------------------------------------------------------===// - -void MaxOp::build(OpBuilder &builder, OperationState &result, Value input, - Value reduction_indices, BoolAttr keep_dims) { - Type out_ty = - InferReductionOpType(input, reduction_indices, keep_dims, &builder); - build(builder, result, out_ty, input, reduction_indices, keep_dims); -} - -//===----------------------------------------------------------------------===// -// MaxPoolOp -//===----------------------------------------------------------------------===// - -LogicalResult MaxPoolOp::FoldOperandsPermutation( - ArrayRef permutation) { - return ::mlir::TF::FoldOperandsPermutation( - permutation, this, {{"strides", strides()}, {"ksize", ksize()}}); -} - -//===----------------------------------------------------------------------===// -// MaxPoolGradOp -//===----------------------------------------------------------------------===// - -static LogicalResult Verify(MaxPoolGradOp op) { - if (!IsOfRankOrUnranked(op.orig_input(), 4)) { - return op.emitOpError() << "requires orig_input to be rank 4"; - } - if (!IsOfRankOrUnranked(op.orig_output(), 4)) { - return op.emitOpError() << "requires orig_output to be rank 4"; - } - if (!IsOfRankOrUnranked(op.grad(), 4)) { - return op.emitOpError() << "requires grad to be rank 4"; - } - return success(); -} - -//===----------------------------------------------------------------------===// -// MeanOp -//===----------------------------------------------------------------------===// - -LogicalResult MeanOp::FoldOperandsPermutation(ArrayRef permutation) { - // Reduction indices must be defined by a constant operation. - auto reduction_op = - dyn_cast_or_null(reduction_indices().getDefiningOp()); - if (!reduction_op) return failure(); - - auto reductions_value = reduction_op.value().dyn_cast(); - if (!reductions_value) return failure(); - - // Prepare new reduction indices according to operand permutation. - SmallVector shuffled_reduction; - llvm::transform(reductions_value.getIntValues(), - std::back_inserter(shuffled_reduction), - [&](APInt idx) { return permutation[idx.getSExtValue()]; }); - - // Add constant operation with a new reduction indices. 
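The MeanOp permutation folding above remaps each constant reduction index through the layout permutation before rebuilding the constant operand. A standalone sketch of that remapping, assuming permutation[i] gives the new position of dimension i; PermuteReductionIndices is a hypothetical helper.

#include <cassert>
#include <cstdint>
#include <vector>

// Remaps constant reduction indices through a layout permutation.
static std::vector<int32_t> PermuteReductionIndices(
    const std::vector<int32_t> &indices,
    const std::vector<int64_t> &permutation) {
  std::vector<int32_t> result;
  result.reserve(indices.size());
  for (int32_t idx : indices)
    result.push_back(static_cast<int32_t>(permutation[idx]));
  return result;
}

int main() {
  // NHWC -> NCHW style permutation: dim 0->0, 1->2, 2->3, 3->1.
  std::vector<int64_t> nhwc_to_nchw = {0, 2, 3, 1};
  // Reducing over the spatial dims (1, 2) in NHWC becomes (2, 3) in NCHW.
  std::vector<int32_t> shuffled = PermuteReductionIndices({1, 2}, nhwc_to_nchw);
  assert(shuffled[0] == 2 && shuffled[1] == 3);
  return 0;
}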
- OpBuilder builder(getOperation()); - auto type = mlir::RankedTensorType::get(shuffled_reduction.size(), - builder.getIntegerType(32)); - auto values = mlir::DenseIntElementsAttr::get(type, shuffled_reduction); - auto shuffled_reduction_op = builder.create(getLoc(), values); - - // Use new reduction indices. - setOperand(1, shuffled_reduction_op); - - return success(); -} - -//===----------------------------------------------------------------------===// -// MulOp -//===----------------------------------------------------------------------===// - -OpFoldResult MulOp::fold(ArrayRef operands) { - return IdentityArithmeticOpFolder(*this, operands); -} - -//===----------------------------------------------------------------------===// -// NegOp -//===----------------------------------------------------------------------===// - -void NegOp::getCanonicalizationPatterns(OwningRewritePatternList &results, - MLIRContext *context) { - results.insert(context); -} - -//===----------------------------------------------------------------------===// -// NotEqualOp -//===----------------------------------------------------------------------===// - -static LogicalResult Verify(NotEqualOp op) { - // If we allow inputs to have incompatible type, then nothing to do. - if (!op.incompatible_shape_error()) return success(); - - // Otherwise, check inputs are broadcastable. - return mlir::OpTrait::impl::verifyCompatibleOperandBroadcast( - op.getOperation()); -} - -void NotEqualOp::build(OpBuilder &builder, OperationState &result, Value x, - Value y, BoolAttr incompatible_shape_error) { - auto result_type = DeduceEqualCmpOpType(&builder, result.location, x, y, - incompatible_shape_error); - return build(builder, result, result_type, x, y, incompatible_shape_error); -} - -//===----------------------------------------------------------------------===// -// OneHotOp -//===----------------------------------------------------------------------===// - -static LogicalResult Verify(OneHotOp op) { - int64_t axis = op.axis().getSExtValue(); - - auto indices_ty = op.indices().getType().dyn_cast(); - if (indices_ty && - !(axis == -1 || (axis >= 0 && axis <= indices_ty.getShape().size()))) { - return op.emitOpError() - << "expected axis (" << axis << ") to be -1 or between [0, " - << indices_ty.getShape().size() << "]"; - } - - if (axis < -1) { - return op.emitOpError() << "expected axis (" << axis - << ") to be -1 or between [0, rank(indices()))"; - } - - if (!IsOfRankOrUnranked(op.depth(), 0)) { - return op.emitOpError() << "requires depth to be a scalar"; - } - if (!IsOfRankOrUnranked(op.on_value(), 0)) { - return op.emitOpError() << "requires on_value to be a scalar"; - } - if (!IsOfRankOrUnranked(op.off_value(), 0)) { - return op.emitOpError() << "requires off_value to be a scalar"; - } - - DenseIntElementsAttr depth_attr; - if (matchPattern(op.depth(), m_Constant(&depth_attr))) { - if (depth_attr.getType().getRank() != 0) - return op.emitOpError() << "requires depth to be a scalar"; - int64_t depth = depth_attr.getValue({}).getSExtValue(); - if (depth < 0) { - return op.emitOpError() << "depth must be non-negative, got: " << depth; - } - } - - return success(); -} - -static TensorType InferOneHotOpType(Value indices, Value depth, Value on_value, - Value off_value, IntegerAttr axis) { - int64_t axis_val = axis.getInt(); - Type element_ty = on_value.getType().cast().getElementType(); - auto unranked_ty = UnrankedTensorType::get(element_ty); - if (axis_val < -1) return unranked_ty; - - auto indices_ty = 
indices.getType().dyn_cast(); - if (!indices_ty) return unranked_ty; - - auto shape = llvm::to_vector<2>(indices_ty.getShape()); - if (axis_val == -1) axis_val = shape.size(); - - int64_t depth_val = ShapedType::kDynamicSize; - DenseIntElementsAttr depth_attr; - if (matchPattern(depth, m_Constant(&depth_attr)) && - depth_attr.getNumElements() == 1) - depth_val = (*depth_attr.begin()).getSExtValue(); - shape.insert(shape.begin() + axis_val, depth_val); - return RankedTensorType::get(shape, element_ty); -} - -void OneHotOp::build(OpBuilder &builder, OperationState &result, Value indices, - Value depth, Value on_value, Value off_value, - IntegerAttr axis) { - build(builder, result, - InferOneHotOpType(indices, depth, on_value, off_value, axis), indices, - depth, on_value, off_value, axis); -} - -//===----------------------------------------------------------------------===// -// PackOp -//===----------------------------------------------------------------------===// - -static LogicalResult Verify(PackOp op) { - // TODO(hinsu): Convert variadic length attributes to derived attributes. - Operation::operand_range values = op.values(); - - if (failed(VerifyTypesCompatibility(values, - /*mask_one_dim=*/false, - op.getOperation()))) { - return failure(); - } - - int64_t inputs_rank = -1; - for (Value value : values) { - if (auto ty = value.getType().dyn_cast()) { - // Exit early as input types are verified to be compatible so all ranked - // tensors have the same rank. - inputs_rank = ty.getRank(); - break; - } - } - if (inputs_rank == -1) return success(); - - // The values can be packed along any of the dimensions between 0 and - // inputs rank, inclusive. Also, as the negative axis values wrap around so - // the axis value range is [-(R+1), R+1). - int64_t range_begin = -inputs_rank - 1; // Inclusive - int64_t range_end = inputs_rank + 1; // Exclusive - int64_t axis = op.axis().getSExtValue(); - if (axis < range_begin || axis >= range_end) { - return op.emitError() << "attribute 'axis' should be within range [" - << range_begin << ", " << range_end - << "); actual value: " << axis; - } - - return success(); -} - -//===----------------------------------------------------------------------===// -// PadOp -//===----------------------------------------------------------------------===// - -LogicalResult PadOp::FoldOperandsPermutation(ArrayRef permutation) { - // Paddings must be defined by a constant operation. - auto paddings_op = dyn_cast_or_null(paddings().getDefiningOp()); - if (!paddings_op) return failure(); - - auto paddings_value = paddings_op.value().dyn_cast(); - if (!paddings_value || - paddings_value.getNumElements() != permutation.size() * 2) - return failure(); - - SmallVector shuffled_paddings(paddings_value.getNumElements()); - for (auto index_pair : llvm::enumerate(paddings_value.getIntValues())) { - size_t outer_idx = index_pair.index() / 2; - size_t inner_idx = index_pair.index() % 2; - - shuffled_paddings[permutation[outer_idx] * 2 + inner_idx] = - index_pair.value().getSExtValue(); - } - - // Add constant operation with a new paddings. - OpBuilder builder(getOperation()); - auto type = mlir::RankedTensorType::get(paddings_value.getType().getShape(), - builder.getIntegerType(32)); - auto values = mlir::DenseIntElementsAttr::get(type, shuffled_paddings); - auto shuffled_paddings_op = builder.create(getLoc(), values); - - // Use new paddings. - setOperand(1, shuffled_paddings_op); - - // Change the result type. 
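The one-hot type inference earlier in this hunk inserts the depth value into the indices shape at the requested axis, with axis == -1 meaning the trailing position. A simplified standalone sketch; InferOneHotShape is hypothetical and skips the axis < -1 unranked case and dynamic depth handling.

#include <cassert>
#include <cstdint>
#include <vector>

// Output shape = indices shape with `depth` inserted at position `axis`;
// axis == -1 appends at the end.
static std::vector<int64_t> InferOneHotShape(std::vector<int64_t> indices_shape,
                                             int64_t depth, int64_t axis) {
  if (axis == -1) axis = static_cast<int64_t>(indices_shape.size());
  indices_shape.insert(indices_shape.begin() + axis, depth);
  return indices_shape;
}

int main() {
  // indices of shape [2, 3], depth 10, axis -1 -> [2, 3, 10].
  std::vector<int64_t> trailing = InferOneHotShape({2, 3}, 10, -1);
  assert((trailing == std::vector<int64_t>{2, 3, 10}));

  // Same indices with axis 0 -> [10, 2, 3].
  std::vector<int64_t> leading = InferOneHotShape({2, 3}, 10, 0);
  assert((leading == std::vector<int64_t>{10, 2, 3}));
  return 0;
}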
- getResult().setType(ShuffleRankedTensorType(getResult().getType(), - ReversePermutation(permutation))); - - return success(); -} - -//===----------------------------------------------------------------------===// -// ParseExampleV2Op -//===----------------------------------------------------------------------===// - -static LogicalResult Verify(ParseExampleV2Op op) { - // NOTE(mrry): This validates properties of an op that would previously be - // validated by the TensorFlow OpDef type checker. In addition to these - // checks, the shape inference function for ParseExampleV2 validates the - // consistency of the argument and result types. - - // Validate dense variadic input and output lengths. - // NOTE(mrry): The Tdense attr is derived from dense_defaults, so we - // do not need to validate dense_defaults. - auto dense_types_count = - std::distance(op.Tdense().begin(), op.Tdense().end()); - auto dense_values_count = - std::distance(op.dense_values().begin(), op.dense_values().end()); - if (dense_values_count != dense_types_count) { - return op.emitError() << "output 'dense_values' should have same length " - << "as attribute 'Tdense'"; - } - - // Validate sparse variadic output lengths. - // NOTE(mrry): The sparse_types attr is derived from sparse_values, so we - // do not need to validate sparse_values. - auto sparse_types_count = - std::distance(op.sparse_types().begin(), op.sparse_types().end()); - if (op.num_sparse() != sparse_types_count) { - return op.emitError() << "attribute 'num_sparse' should be the same as " - << "the length of attribute 'sparse_types'"; - } - if (op.sparse_indices().size() != sparse_types_count) { - return op.emitError() << "output 'sparse_indices' should have same length " - << "as attribute 'sparse_types'"; - } - if (op.sparse_shapes().size() != sparse_types_count) { - return op.emitError() << "output 'sparse_shapes' should have same length " - << "as attribute 'sparse_types'"; - } - - // Validate ragged variadic output lengths. 
- auto ragged_value_types_count = std::distance(op.ragged_value_types().begin(), - op.ragged_value_types().end()); - auto ragged_split_types_count = std::distance(op.ragged_split_types().begin(), - op.ragged_split_types().end()); - if (ragged_value_types_count != ragged_split_types_count) { - return op.emitError() << "attribute 'ragged_value_types' should have same " - << "length as attribute 'ragged_split_types'"; - } - - return success(); -} - -//===----------------------------------------------------------------------===// -// PartitionedCallOp -//===----------------------------------------------------------------------===// - -template -static LogicalResult VerifyPartitionedCall(OpClass op) { - auto module = op.template getParentOfType(); - SymbolRefAttr func = op.getAttr("f").template cast(); - - auto function = - dyn_cast_or_null(SymbolTable::lookupSymbolIn(module, func)); - - if (!function) { - return op.emitError("'f' attribute refers to an undefined function: ") - << func; - } - - FunctionType function_ty = function.getType(); - int func_arg_count = function_ty.getNumInputs(); - int arg_count = op.args().size(); - - if (arg_count != func_arg_count) { - return op.emitError() << "argument count mismatch: 'args' has " << arg_count - << " arguments, but '" << func << "' expects " - << func_arg_count; - } - - return success(); -} - -//===----------------------------------------------------------------------===// -// PowOp -//===----------------------------------------------------------------------===// - -OpFoldResult PowOp::fold(ArrayRef operands) { - auto constant_y = operands[1].dyn_cast_or_null(); - if (constant_y && constant_y.isSplat()) { - APFloat y_value = constant_y.getSplatValue(); - auto output_type = getType().cast(); - if (y_value.isZero() && output_type.hasStaticShape()) { - return DenseElementsAttr::get( - output_type, - FloatAttr::get(output_type.getElementType(), /*value=*/1.0)); - } - if (y_value.isExactlyValue(1.0)) { - return x(); - } - } - return {}; -} - -//===----------------------------------------------------------------------===// -// QrOp -//===----------------------------------------------------------------------===// - -// Verifies that, -// -// * Input type, if ranked, must have at least 2 dimensions and at most -// INT32_MAX dimensions. 
-// -static LogicalResult Verify(QrOp op) { - auto ttype = op.input().getType().cast(); - if (!ttype.hasRank()) return success(); - if (!HasRankAtLeast(op.input(), 2)) - return op.emitOpError( - "requires ranked input tensor to be of rank 2 or more"); - if (!HasRankAtMost(op.input(), std::numeric_limits::max())) - return op.emitOpError( - "requires ranked input tensor to be of rank INT32_MAX or less"); - - return success(); -} - -//===----------------------------------------------------------------------===// -// ReciprocalOp -//===----------------------------------------------------------------------===// - -void ReciprocalOp::getCanonicalizationPatterns( - OwningRewritePatternList &results, MLIRContext *context) { - results.insert(context); -} - -//===----------------------------------------------------------------------===// -// RandomUniformOp -//===----------------------------------------------------------------------===// - -static LogicalResult Verify(RandomUniformOp op) { - if (!IsOfRankOrUnranked(op.shape(), 1)) - return op.emitOpError("shape must be 1D tensor"); - return success(); -} - -//===----------------------------------------------------------------------===// -// RangeOp -//===----------------------------------------------------------------------===// - -void RangeOp::build(OpBuilder &builder, OperationState &result, Value start, - Value limit, Value delta) { - assert(start.getType() == limit.getType()); - assert(start.getType() == delta.getType()); - DenseIntElementsAttr start_val; - DenseIntElementsAttr limit_val; - DenseIntElementsAttr delta_val; - if (matchPattern(start, m_Constant(&start_val)) && - matchPattern(limit, m_Constant(&limit_val)) && - matchPattern(delta, m_Constant(&delta_val))) { - auto size = llvm::APIntOps::RoundingSDiv( - *limit_val.begin() - *start_val.begin(), *delta_val.begin(), - llvm::APInt::Rounding::DOWN); - return RangeOp::build( - builder, result, - RankedTensorType::get( - size.getSExtValue(), - start.getType().cast().getElementType()), - start, limit, delta); - } - return RangeOp::build( - builder, result, - RankedTensorType::get( - {-1}, start.getType().cast().getElementType()), - start, limit, delta); -} -//===----------------------------------------------------------------------===// -// RankOp -//===----------------------------------------------------------------------===// - -void RankOp::build(OpBuilder &builder, OperationState &result, Value input) { - return RankOp::build(builder, result, - RankedTensorType::get({}, builder.getIntegerType(32)), - input); -} - -// This will create a constant value for RankOp of a ranked tensor. 
-OpFoldResult RankOp::fold(ArrayRef operands) { - auto type = input().getType(); - auto ranked_type = type.dyn_cast(); - if (!ranked_type) return {}; - - auto output_type = getType().cast(); - int32_t rank = ranked_type.getRank(); - return DenseIntElementsAttr::get(output_type, rank); -} - -//===----------------------------------------------------------------------===// -// RealDivOp -//===----------------------------------------------------------------------===// - -void RealDivOp::getCanonicalizationPatterns(OwningRewritePatternList &results, - MLIRContext *context) { - results.insert(context); -} - -OpFoldResult RealDivOp::fold(ArrayRef operands) { - return IdentityArithmeticOpFolder(*this, operands); -} - -//===----------------------------------------------------------------------===// -// ReshapeOp -//===----------------------------------------------------------------------===// - -// TODO(b/128020684): Verify the output type. -static LogicalResult Verify(ReshapeOp op) { - auto shape_type = op.shape().getType().cast(); - if (!shape_type.hasRank()) return success(); - if (shape_type.getRank() != 1) - return op.emitOpError("shape must be 1D tensor"); - auto rank_by_shape = shape_type.getShape()[0]; - auto type_of_tensor = op.tensor().getType().cast(); - // No compile time verification for unknown sized shape. - if (rank_by_shape == -1 || !type_of_tensor.hasStaticShape()) return success(); - int64_t num_by_tensor = type_of_tensor.getNumElements(); - - auto out_ty = op.getType().dyn_cast(); - if (out_ty && out_ty.hasStaticShape()) { - int64_t num_output_elements = out_ty.getNumElements(); - if (num_by_tensor != num_output_elements) - return op.emitOpError() - << "number of output elements (" << num_output_elements - << ") does not match expected number of elements (" - << num_by_tensor << ")"; - } - - // Check values if constant shape. No compiling time verification for - // non-constant shape. - auto *shape_op = op.shape().getDefiningOp(); - if (!shape_op) return success(); - Attribute shape_cst; - if (!matchPattern(shape_op, m_Constant(&shape_cst))) return success(); - auto shape_cst_attr = shape_cst.dyn_cast(); - if (!shape_cst_attr) return op.emitOpError("shape must be a valid tensor"); - - if (auto opaque_attr = shape_cst_attr.dyn_cast()) { - opaque_attr.decode(shape_cst_attr); - } - - // We know the shape is a 1-D Tensor, then let us get the number of - // elements it implies. - unsigned num_by_shape = 1; - unsigned unknown_dim_count = 0; - for (int i = 0, e = rank_by_shape; i != e; ++i) { - auto num = shape_cst_attr.getValue(i).getInt(); - // The dimension size value can be -1, and that the real size needs to - // be computed so that the total size remains constant. At most one - // component of shape can be -1. - if (num == -1) { - if (++unknown_dim_count > 1) { - return op.emitOpError("more than one component of shape are -1"); - } - } else { - num_by_shape *= num; - } - } - // If there is one component of shape is -1, the dimension should be - // computed so that the total size remains constant. - if (unknown_dim_count == 1) { - if (num_by_tensor % num_by_shape != 0) - return op.emitOpError( - "one component of shape is -1 but couldn't infer the dimension"); - return success(); - } - // If the elements by the tensor and implies by the shape don't match, - // fail this static check. 
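The element-count reasoning above is the usual reshape rule: at most one target dimension may be -1, and that entry is resolved so the total element count is preserved. A standalone sketch of the rule; ResolveReshape is a hypothetical helper, not the verifier itself.

#include <cassert>
#include <cstdint>
#include <optional>
#include <vector>

// Returns the resolved shape, or nullopt when the request is invalid
// (more than one -1, or element counts that cannot match).
static std::optional<std::vector<int64_t>> ResolveReshape(
    int64_t input_elements, std::vector<int64_t> shape) {
  int64_t known_product = 1;
  int unknown_index = -1;
  for (int i = 0; i < static_cast<int>(shape.size()); ++i) {
    if (shape[i] == -1) {
      if (unknown_index != -1) return std::nullopt;  // more than one -1
      unknown_index = i;
    } else {
      known_product *= shape[i];
    }
  }
  if (unknown_index == -1) {
    if (input_elements != known_product) return std::nullopt;
    return shape;
  }
  if (known_product == 0 || input_elements % known_product != 0)
    return std::nullopt;
  shape[unknown_index] = input_elements / known_product;
  return shape;
}

int main() {
  // 24 elements reshaped to [2, -1, 3] resolves the -1 to 4.
  auto ok = ResolveReshape(24, {2, -1, 3});
  assert(ok && (*ok == std::vector<int64_t>{2, 4, 3}));
  // 24 elements cannot be reshaped to [5, -1]: 24 % 5 != 0.
  assert(!ResolveReshape(24, {5, -1}));
  return 0;
}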
- if (num_by_tensor != num_by_shape) { - return op.emitOpError( - "mismatch in tensor elements and shape implied elements"); - } - return success(); -} - -void ReshapeOp::build(OpBuilder &builder, OperationState &result, Value tensor, - Value shape) { - auto ttype = tensor.getType().cast(); - auto etype = ttype.getElementType(); - - auto unranked = [&builder, etype, &result, shape, tensor]() { - return ReshapeOp::build(builder, result, UnrankedTensorType::get(etype), - tensor, shape); - }; - - // If tensor is unranked then we have no info about output of shape. - if (!ttype.hasRank()) return unranked(); - - DenseIntElementsAttr attr_shape; - if (matchPattern(shape, m_Constant(&attr_shape))) { - llvm::SmallVector const_shape; - const_shape.reserve(attr_shape.getNumElements()); - - // Detect if reshape output shape is folded. - bool flatten = false; - int unknown_index = -1; - // The product of constant shape argument excluding unknown dimension. - int64_t product_cshape = 1; - for (auto e : llvm::enumerate(attr_shape)) { - int64_t val = e.value().getSExtValue(); - if (IsUnknownDimOrRank(val)) { - if (flatten) { - mlir::emitError(result.location) - << "only one unknown dimension allowed"; - return; - } - flatten = true; - unknown_index = e.index(); - } else { - product_cshape *= val; - } - const_shape.push_back(val); - } - - // Compute the value of the unknown dimension. - if (flatten) { - // Compute number of elements in tensor shape. - auto tshape = ttype.getShape(); - int64_t product_tshape = std::accumulate(tshape.begin(), tshape.end(), 1, - std::multiplies()); - // Set the unknown dimension such that total number of elements remain - // constant. - // Note: The case where the ratio is not integral, and so the total size - // of reshape not constant, is checked in verify function. - const_shape[unknown_index] = product_tshape / product_cshape; - } - return ReshapeOp::build(builder, result, - RankedTensorType::get(const_shape, etype), tensor, - shape); - } - return unranked(); -} - -void ReshapeOp::getCanonicalizationPatterns(OwningRewritePatternList &results, - MLIRContext *context) { - results.insert(context); -} - -//===----------------------------------------------------------------------===// -// SelectOp -//===----------------------------------------------------------------------===// - -void SelectOp::getCanonicalizationPatterns(OwningRewritePatternList &results, - MLIRContext *context) { - results.insert(context); -} - -// Verifies a few extra requirements on SelectOp: -// (1) `then` and `else` must have same shape -// (2) At least one of the following must be true: -// (a) `cond` has the same rank as `then` and `else` -// (b) `cond` is a scalar -// (c) `cond` is a vector AND `then` and `else` are non-scalar with their -// first dimension equal to `cond`. -static LogicalResult Verify(SelectOp op) { - auto then_tensor = op.t().getType().cast(); - auto else_tensor = op.e().getType().cast(); - // Check (1). - if (!AreCastCompatible({then_tensor, else_tensor})) - return op.emitOpError() << "requires t and e have compatible shapes"; - - // Get data rank (if exists). - int data_rank; - // If data is unranked or data_rank is 0, this will remain -2. Otherwise - // refers to first dimension of then and/or else. 
- int data_first_dim = -2; - bool then_has_rank = then_tensor.hasRank(); - bool else_has_rank = else_tensor.hasRank(); - if (then_has_rank && else_has_rank) { - data_rank = then_tensor.getRank(); - if (then_tensor.getRank() > 0) - data_first_dim = then_tensor.getShape().front(); - if (else_tensor.getRank() > 0) - data_first_dim = std::max( - static_cast(else_tensor.getShape().front()), data_first_dim); - } else if (then_has_rank) { - data_rank = then_tensor.getRank(); - if (then_tensor.getRank() > 0) - data_first_dim = then_tensor.getShape().front(); - } else if (else_has_rank) { - data_rank = else_tensor.getRank(); - if (else_tensor.getRank() > 0) - data_first_dim = else_tensor.getShape().front(); - } else { - // Neither has a rank. - return success(); - } - - auto cond_tensor = op.condition().getType().dyn_cast(); - if (!cond_tensor) return success(); - auto cond_rank = cond_tensor.getRank(); - // Check (2a) and (2b). - if (cond_rank == 0 || cond_rank == data_rank) return success(); - // Check (2c). - if (cond_rank == 1) { - auto cond_shape = cond_tensor.getShape().front(); - if (data_rank == 0) { - return op.emitOpError() - << "requires that t and e are nonscalar when pred is a vector"; - } - // We know `data` tensor has a rank of at least 1. - if (data_first_dim != -1 && cond_shape != -1 && - data_first_dim != cond_shape) { - return op.emitOpError() << "requires that, when pred is a vector, the " - "shape matches the first dimension of t and e"; - } - return success(); - } - // None of (2a,b,c) were true; fail. - return op.emitOpError() << "requires that pred is a scalar OR has the same " - "rank as t and e OR is a vector"; -} - -//===----------------------------------------------------------------------===// -// SelectV2Op -//===----------------------------------------------------------------------===// - -static Type InferSelectV2OpType(Value condition, Value e, Value t) { - Type element_ty = e.getType().cast().getElementType(); - auto unranked_ty = UnrankedTensorType::get(element_ty); - - Type broadcasted_ty = - OpTrait::util::getBroadcastedType(e.getType(), t.getType()); - if (!broadcasted_ty) return unranked_ty; - - auto cond_ranked_ty = condition.getType().dyn_cast(); - auto broadcasted_ranked_ty = broadcasted_ty.dyn_cast(); - if (!cond_ranked_ty || !broadcasted_ranked_ty) return unranked_ty; - - // Explicitly get broadcasted output type as element types of condition may - // not be same as the broadcated type's element type. - SmallVector result_shape; - if (!OpTrait::util::getBroadcastedShape(cond_ranked_ty.getShape(), - broadcasted_ranked_ty.getShape(), - result_shape)) - return unranked_ty; - return RankedTensorType::get(result_shape, element_ty); -} - -void SelectV2Op::build(OpBuilder &builder, OperationState &result, - Value condition, Value e, Value t) { - build(builder, result, InferSelectV2OpType(condition, e, t), condition, e, t); -} - -//===----------------------------------------------------------------------===// -// ShapeOp -//===----------------------------------------------------------------------===// - -namespace { -// Validates Shape/ShapeN/VariableShape operand and associated result types. -LogicalResult VerifyShapeOperandAndResult(Operation *op, Type operand_type, - Type result_type, - int variadic_idx = -1) { - std::string variadic_idx_str = - variadic_idx < 0 ? 
"" : llvm::formatv(" #{0}", variadic_idx).str(); - - auto result_ranked_type = result_type.dyn_cast(); - if (!result_ranked_type) return success(); - if (result_ranked_type.getShape().size() != 1) - return op->emitOpError("requires 1D type for result") << variadic_idx_str; - - auto operand_ranked_type = operand_type.dyn_cast_or_null(); - if (operand_ranked_type) { - // The operand is a ranked tensor. - if (result_ranked_type.hasStaticShape() && - !operand_ranked_type.getShape().empty() && - result_ranked_type.getDimSize(0) != - operand_ranked_type.getShape().size()) - return op->emitOpError("requires dimension size of result") - << variadic_idx_str << " to match rank of operand" - << variadic_idx_str; - } else if (result_ranked_type.hasStaticShape()) { - // The operand is an unranked tensor, print a warning if the result - // is static. - // Note: We do not handle this situation as an error, this would be too - // restrictive due to incompleteness of shape inference at this point. - op->emitWarning("has static shape result") - << variadic_idx_str << " for unranked operand" << variadic_idx_str; - } - - Type element_type = result_ranked_type.getElementType(); - if (!element_type.isSignlessInteger(32) && - !element_type.isSignlessInteger(64)) - return op->emitOpError("requires int32 or int64 return type for result") - << variadic_idx_str; - - return success(); -} -} // anonymous namespace - -static LogicalResult Verify(ShapeOp op) { - return VerifyShapeOperandAndResult(op, op.input().getType(), op.getType()); -} - -// Converts shape of the given type to attribute if it is of ranked tensor type. -// Returned attribute has integer elements of the given width. -static Attribute ConvertShapeToAttr(Type input_ty, int out_width) { - auto ranked_ty = input_ty.dyn_cast(); - if (!ranked_ty || !ranked_ty.hasStaticShape()) return {}; - - auto shape = ranked_ty.getShape(); - int rank = shape.size(); - - SmallVector dimensions; - dimensions.reserve(rank); - for (int i = 0; i < rank; ++i) - dimensions.push_back(APInt(out_width, shape[i])); - - auto result_type = RankedTensorType::get( - {rank}, IntegerType::get(out_width, input_ty.getContext())); - return DenseElementsAttr::get(result_type, dimensions); -} - -OpFoldResult ShapeOp::fold(ArrayRef operands) { - int width = - getType().cast().getElementType().getIntOrFloatBitWidth(); - return ConvertShapeToAttr(getOperand().getType(), width); -} - -void ShapeOp::build(OpBuilder &builder, OperationState &result, Value input, - BoolAttr use32Bit) { - auto rankedTensorType = input.getType().dyn_cast(); - int64_t rank = rankedTensorType ? rankedTensorType.getRank() : -1; - auto out_type = use32Bit.getValue() ? 
builder.getIntegerType(32) - : builder.getIntegerType(64); - return ShapeOp::build(builder, result, - RankedTensorType::get({rank}, out_type), input); -} - -//===----------------------------------------------------------------------===// -// ShapeNOp -//===----------------------------------------------------------------------===// - -static LogicalResult Verify(ShapeNOp op) { - const size_t num_tensors = op.N(); - - if (op.getNumOperands() != num_tensors) - return op.emitOpError() << "requires " << num_tensors << " operand(s), got " - << op.getNumOperands() << " operand(s)"; - - if (op.getNumResults() != num_tensors) - return op.emitOpError() << "requires " << num_tensors << " result(s), got " - << op.getNumResults() << " result(s)"; - - for (auto i : llvm::seq(0, num_tensors)) { - auto verification = VerifyShapeOperandAndResult( - op, op.getOperand(i).getType(), op.getResult(i).getType(), i); - if (failed(verification)) return verification; - } - - return success(); -} - -LogicalResult ShapeNOp::fold(ArrayRef operands, - SmallVectorImpl &results) { - if (getNumOperands() == 0) return success(); - int width = - getType(0).cast().getElementType().getIntOrFloatBitWidth(); - - for (Type input_ty : getOperandTypes()) { - OpFoldResult result = ConvertShapeToAttr(input_ty, width); - if (!result) return failure(); - - results.push_back(result); - } - return success(); -} - -// TODO(hinsu): Add canonicalization pattern for ShapeN ops that don't have all -// static input shapes. Replacing output values corresponding to static input -// types may enable optimizations in users of the values. - -//===----------------------------------------------------------------------===// -// SizeOp -//===----------------------------------------------------------------------===// - -// Verifies that, -// -// * Input type, if is a ranked tensor, has at most INT32_MAX dimensions. -// -static LogicalResult Verify(SizeOp op) { - if (!HasRankAtMost(op.input(), std::numeric_limits::max())) - return op.emitOpError( - "requires ranked input tensor to be of rank INT32_MAX or less"); - - return success(); -} - -//===----------------------------------------------------------------------===// -// SliceOp -//===----------------------------------------------------------------------===// - -// Verifies that: -// -// - operands begin and size are 1D with the same number of elements. -// - if the input is a ranked tensor, the rank of the input equals the number -// of elements in operands begin and size. 
-// - if begin are constants, that -// 0 <= begin[i] <= begin[i] + size[i] <= input_ty.getShape()[i] -// - if begins aren't constant but the input is a ranked tensor, that -// size[i] <= input_ty.getShape()[i] -// -static LogicalResult Verify(SliceOp op) { - RankedTensorType begin_ty = GetRankedTensorTypeForOperand(op.begin()); - if (begin_ty && begin_ty.getRank() != 1) { - return op.emitOpError() << "requires begin operand to be 1D tensor"; - } - - RankedTensorType size_ty = GetRankedTensorTypeForOperand(op.size()); - if (size_ty && size_ty.getRank() != 1) { - return op.emitOpError() << "requires size operand to be 1D tensor"; - } - - if (!begin_ty || !size_ty || !begin_ty.hasStaticShape() || - !size_ty.hasStaticShape()) - return success(); - - if (begin_ty.getNumElements() != size_ty.getNumElements()) { - return op.emitOpError() << "requires begin and size operands to have the" - " same number of elements"; - } - - auto input_ty = op.input().getType().dyn_cast(); - if (input_ty && begin_ty.getNumElements() != input_ty.getRank()) { - return op.emitOpError() << "requires number of elements in begin and size" - "are equal to input rank"; - } - - DenseIntElementsAttr begin_indices; - if (matchPattern(op.begin(), m_Constant(&begin_indices))) { - DenseIntElementsAttr slice_sizes; - bool constant_slice_sizes = - matchPattern(op.size(), m_Constant(&slice_sizes)); - int dim = 0; - for (const APInt &raw_begin_index : begin_indices.getValues()) { - int64_t begin_index = raw_begin_index.getSExtValue(); - int64_t input_size = input_ty ? input_ty.getShape()[dim] : -1; - int64_t slice_size = constant_slice_sizes - ? slice_sizes.getValue(dim).getSExtValue() - : 0; - if (slice_size == -1 && input_size != -1) { - slice_size = input_size - begin_index; - } - if (begin_index < 0 || - (input_size != -1 && begin_index + slice_size > input_size)) { - return op.emitOpError() - << "requires 0 <= begin[i] <= begin[i] + size[i] <= Di"; - } - ++dim; - } - } else if (input_ty) { - // If the inputs are ranked, we can do a few more sanity checks. - DenseIntElementsAttr slice_sizes; - if (matchPattern(op.size(), m_Constant(&slice_sizes))) { - auto input_shape = input_ty.getShape(); - for (int64_t i = 0; i < input_ty.getRank(); ++i) { - int64_t slice_size = slice_sizes.getValue(i).getInt(); - int64_t input_size = input_shape[i]; - if (slice_size != -1 && input_size != -1 && slice_size > input_size) { - return op.emitOpError() << "requires size[i] <= Di, even if begin[i] " - "is unknown at compile time"; - } - } - } - } - - return success(); -} - -//===----------------------------------------------------------------------===// -// SoftmaxOp -//===----------------------------------------------------------------------===// - -static LogicalResult Verify(SoftmaxOp op) { - if (!HasRankAtLeast(op.logits(), 1)) { - return op.emitOpError("requires operand to have rank at least 1"); - } - return success(); -} - -//===----------------------------------------------------------------------===// -// SoftmaxCrossEntropyWithLogitsOp -//===----------------------------------------------------------------------===// - -// Verifies that, -// -// * Input types are broadcast compatible and the broadcasted type has rank two. 
-// -static LogicalResult Verify(SoftmaxCrossEntropyWithLogitsOp op) { - auto broadcasted_ty = OpTrait::util::getBroadcastedType( - op.features().getType(), op.labels().getType()) - .dyn_cast_or_null(); - if (!broadcasted_ty || - (broadcasted_ty.hasRank() && broadcasted_ty.getRank() != 2)) - return op.emitOpError( - "requires features and labels to be broadcast compatible to rank two"); - - return success(); -} - -//===----------------------------------------------------------------------===// -// SparseSoftmaxCrossEntropyWithLogitsOp -//===----------------------------------------------------------------------===// - -static LogicalResult Verify(SparseSoftmaxCrossEntropyWithLogitsOp op) { - if (!IsOfRankOrUnranked(op.features(), 2)) { - return op.emitOpError("requires features operand of rank two"); - } - if (!IsOfRankOrUnranked(op.labels(), 1)) { - return op.emitOpError("requires labels operand of rank one"); - } - auto features_ty = op.features().getType().dyn_cast(); - auto labels_ty = op.labels().getType().dyn_cast(); - if (features_ty && labels_ty) { - int64_t features_batches = features_ty.getDimSize(0); - int64_t labels_batches = labels_ty.getDimSize(0); - if (!ShapedType::isDynamic(features_batches) && - !ShapedType::isDynamic(labels_batches) && - features_batches != labels_batches) - return op.emitOpError( - "requires features and labels with matching first dimension"); - } - return success(); -} - -//===----------------------------------------------------------------------===// -// SplitOp -//===----------------------------------------------------------------------===// - -// Verifies the input and split dimension operands for tf.Split/tf.SplitV. -// Writes the split dimension's index (adjusted with input rank) via `dim_index` -// if it's a constant. -template -LogicalResult VerifySplitInputAndSplitDim(Op op, Optional *dim_index) { - *dim_index = llvm::None; - - Value split_dim = op.split_dim(); - if (auto split_dim_type = split_dim.getType().dyn_cast()) - if (split_dim_type.getRank() != 0) - return op.emitOpError( - "split dimension should be an integer scalar tensor"); - - // We can perform further verification if the input tensor to be split has - // known rank and the split dimension tensor is a constant. 
- - auto input_type = op.value().getType().template dyn_cast(); - if (!input_type) return success(); - - int64_t input_rank = input_type.getRank(); - if (input_rank == 0) - return op.emitOpError("cannot split scalar input tensor"); - - DenseIntElementsAttr split_dim_attr; - if (!matchPattern(split_dim, m_Constant(&split_dim_attr))) return success(); - - int64_t index = (*split_dim_attr.begin()).getSExtValue(); - - if (index + input_rank < 0 || index >= input_rank) { - return op.emitOpError("split dimension must be in range [-") - << input_rank << ", " << input_rank << ")"; - } - - if (index < 0) index += input_rank; - *dim_index = index; - - return success(); -} - -static LogicalResult Verify(SplitOp op) { - Optional dim_index; - if (failed(VerifySplitInputAndSplitDim(op, &dim_index))) return failure(); - if (!dim_index) return success(); - - int64_t input_dim_size = - op.value().getType().cast().getDimSize(*dim_index); - if (input_dim_size == ShapedType::kDynamicSize) return success(); - - if (input_dim_size % op.getNumResults() != 0) - return op.emitOpError("dimension #") - << *dim_index << " not divisible by the number of result tensors"; - - return success(); -} - -//===----------------------------------------------------------------------===// -// SplitVOp -//===----------------------------------------------------------------------===// - -static LogicalResult Verify(SplitVOp op) { - auto split_sizes_type = - op.size_splits().getType().dyn_cast(); - if (!split_sizes_type) return success(); - - if (split_sizes_type.getRank() != 1 || - split_sizes_type.getDimSize(0) != op.getNumResults()) - return op.emitOpError("split sizes should be a 1D tensor of ") - << op.getNumResults() << " elements"; - - Optional dim_index = 0; - if (failed(VerifySplitInputAndSplitDim(op, &dim_index))) return failure(); - if (!dim_index) return success(); - - int64_t input_dim_size = - op.value().getType().cast().getDimSize(*dim_index); - if (input_dim_size == ShapedType::kDynamicSize) return success(); - - // If split sizes come from a constant, they must sum to the dimension size - // along split_dim, and we can have no more than one dynamic dimension. - DenseIntElementsAttr split_sizes_attr; - if (!matchPattern(op.size_splits(), m_Constant(&split_sizes_attr))) - return success(); - - int64_t total_dim_size = 0; // Total dimension size assigned to splits - llvm::Optional dynamic_dim_index; - - SmallVector split_sizes; - split_sizes.reserve( - split_sizes_attr.getType().cast().getNumElements()); - - for (auto dim : llvm::enumerate(split_sizes_attr)) { - int64_t dim_val = dim.value().getSExtValue(); - split_sizes.push_back(dim_val); - if (dim_val == ShapedType::kDynamicSize) { - // We cannot have more than one dynamic dimension. 
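The SplitV size check around this point enforces the usual rule for constant split sizes: at most one entry may be dynamic, and the static entries must sum to the split dimension exactly (or to at most the dimension when a dynamic entry absorbs the remainder). A standalone sketch, using -1 in place of ShapedType::kDynamicSize; ValidSplitSizes is a hypothetical helper.

#include <cassert>
#include <cstdint>
#include <vector>

static bool ValidSplitSizes(const std::vector<int64_t> &sizes,
                            int64_t split_dim_size) {
  int64_t total = 0;
  int dynamic_count = 0;
  for (int64_t s : sizes) {
    if (s == -1) {
      ++dynamic_count;
    } else {
      total += s;
    }
  }
  if (dynamic_count > 1) return false;               // at most one dynamic entry
  if (dynamic_count == 0) return total == split_dim_size;
  return total <= split_dim_size;                    // remainder goes to the -1
}

int main() {
  assert(ValidSplitSizes({2, 3, 5}, 10));    // 2 + 3 + 5 == 10
  assert(ValidSplitSizes({2, -1, 5}, 10));   // -1 resolves to 3
  assert(!ValidSplitSizes({2, -1, -1}, 10)); // two dynamic entries
  assert(!ValidSplitSizes({2, 3}, 10));      // 5 != 10 with no dynamic entry
  return 0;
}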
- if (dynamic_dim_index) - return op.emitOpError( - "cannot have more than one dynamic dimension in split sizes"); - dynamic_dim_index = dim.index(); - } else { - total_dim_size += dim_val; - } - } - - if (!dynamic_dim_index && total_dim_size != input_dim_size) - return op.emitOpError( - "split sizes must sum up to the dimension size along split " - "dimension, found ") - << total_dim_size << " vs " << input_dim_size; - - if (dynamic_dim_index && total_dim_size > input_dim_size) - return op.emitOpError( - "split sizes must sum up to be less than or equal to the " - "dimension size along split dimension, found ") - << total_dim_size << " vs " << input_dim_size; - - return success(); -} - -//===----------------------------------------------------------------------===// -// SquareOp -//===----------------------------------------------------------------------===// - -void SquareOp::getCanonicalizationPatterns(OwningRewritePatternList &results, - MLIRContext *context) { - results.insert(context); -} - -//===----------------------------------------------------------------------===// -// SubOp -//===----------------------------------------------------------------------===// - -void SubOp::getCanonicalizationPatterns(OwningRewritePatternList &results, - MLIRContext *context) { - results.insert(context); -} - -OpFoldResult SubOp::fold(ArrayRef operands) { - return IdentityArithmeticOpFolder(*this, operands); -} - -//===----------------------------------------------------------------------===// -// SumOp -//===----------------------------------------------------------------------===// - -void SumOp::build(OpBuilder &builder, OperationState &result, Value input, - Value reduction_indices, BoolAttr keep_dims) { - Type out_ty = - InferReductionOpType(input, reduction_indices, keep_dims, &builder); - build(builder, result, out_ty, input, reduction_indices, keep_dims); -} - -//===----------------------------------------------------------------------===// -// StridedSliceOp -//===----------------------------------------------------------------------===// - -// TODO(b/154160827): Add a canonicalization pattern from tf.StridedSliceOp to -// tf.SliceOp if both of the following are true: -// - All strides have a known value equal to 1 -// - No masks are set (or masks can be applied by transforming the inputs to -// Slice) - -// Verifies that, -// -// - begin, end and strides operands are 1D and they have the same number of -// elements. Here, the number of elements should be less than 32 to support -// 32-bit mask attributes. -// - None of the strides values are zero. -// - Ellipsis mask can have at most one bit set. - -template -static LogicalResult VerifyStridedSliceBase(OpTy op) { - // Expected size for operands begin, end and strides vector operands. - int64_t expected_size = -1; - - for (Value val : {op.begin(), op.end(), op.strides()}) { - auto operand_ty = val.getType().dyn_cast(); - if (!operand_ty || !operand_ty.hasStaticShape()) { - // TensorFlow constant ops may have non-static shape because the shape is - // not propagated during constant folding. If the defining op for this - // operand is a constant op, use the constant op's attribute to get the - // actual shape. 
- DenseIntElementsAttr attr; - if (!matchPattern(val, m_Constant(&attr))) continue; - operand_ty = attr.getType(); - } - - if (operand_ty.getRank() != 1) - return op.emitOpError() - << "requires begin, end and strides to be 1D tensors"; - - int64_t length = operand_ty.getDimSize(0); - if (length == -1) continue; - - if (expected_size == -1) { - // This op uses 32-bit masks. - if (length >= 32) - return op.emitOpError( - "requires begin, end and strides operands with less than 32 " - "elements"); - - expected_size = length; - } else if (length != expected_size) { - return op.emitOpError() << "requires begin, end and strides to have the " - "same number of elements"; - } - } - - // If strides are constants, verify that none of the element is zero. - DenseIntElementsAttr strides; - if (matchPattern(op.strides(), m_Constant(&strides))) { - if (llvm::is_contained(strides.getValues(), 0)) - return op.emitOpError("requires non-zero strides"); - } - - // Use bit compares to ensure ellipsis_mask is 0 or a power of 2, i.e. there - // exists only no more than one ellipsis. - uint32_t ellipsis_mask = op.ellipsis_mask().getZExtValue(); - if (ellipsis_mask != 0 && !llvm::isPowerOf2_32(ellipsis_mask)) - return op.emitOpError("cannot have multiple ellipses"); - - return success(); -} - -// Clamps the given `val`: returns `low` if `val` is less than `low`; returns -// `high` if `high` is less than `val`; otherwise returns `val`. -template -constexpr const T &Clamp(const T &val, const T &low, const T &high) { - assert(!(high < low)); - return (val < low) ? low : (high < val) ? high : val; -} - -// Checks if the `index` bit of `val` is set. -template -constexpr bool IsSet(const T &val, unsigned index) { - return (val & (1 << index)) != 0; -} - -// Sets the `index` bit of `val`. -template -constexpr void Set(T &val, unsigned index) { - val |= (1 << index); -} - -// Unset the `index` bit of `val`. -template -constexpr void Unset(T &val, unsigned index) { - val &= ~(1 << index); -} - -// Copy the `src_index` bit of `src` to `dst_index` bit of `dst`. -template -constexpr void CopyBit(const T &src, unsigned src_index, T &dst, - unsigned dst_index) { - if (IsSet(src, src_index)) - Set(dst, dst_index); - else - Unset(dst, dst_index); -} - -// The sparse spec of strided slice does not correspond to the number of -// dimensions. For example, sparse spec for foo[..., 3:10] for foo of shape (2, -// 4, 8) would have dims = 2. -struct SparseSliceSpec { - int64_t dims; - int32_t begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask; - const ArrayRef &begin; - const ArrayRef &end; - const ArrayRef &strides; -}; - -// The dense spec of strided slice is the canonicalized version of sparse spec. -// The number of dimensions of dense spec correspond to the number of dimensions -// in operand tensor. -struct DenseSliceSpec { - int64_t dims; - int32_t begin_mask, end_mask, shrink_axis_mask; - SmallVectorImpl &begin; - SmallVectorImpl &end; - SmallVectorImpl &strides; -}; - -// Make a sparse spec into a dense index spec. -// The sparse spec does not correspond to the number of dimensions -// Make a dense spec that corresponds to the number of dimensions -// -// For example suppose foo[...,3:, 2] on foo.shape=(2,2,3,4) then -// we need to produce the missing begin_mask, end_mask for the first two -// dimensions i.e. foo[:, :, 3:, 2]. 
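To make the example in the comment above concrete: foo[..., 3:, 2] has three sparse entries against a rank-4 input, so the single ellipsis must expand to the two leading dimensions, giving the dense spec foo[:, :, 3:, 2]. A tiny sketch of just that counting step; EllipsisSpan is hypothetical and ignores new_axis entries, which the real dense-spec construction also has to account for.

#include <cassert>
#include <cstdint>

// One sparse entry is the ellipsis itself; every other sparse entry consumes
// exactly one dense dimension, so the ellipsis covers whatever is left.
static int64_t EllipsisSpan(int64_t dense_dims, int64_t sparse_dims) {
  return dense_dims - (sparse_dims - 1);
}

int main() {
  // foo[..., 3:, 2]: three sparse entries against a rank-4 tensor.
  assert(EllipsisSpan(/*dense_dims=*/4, /*sparse_dims=*/3) == 2);
  // foo[0, ...]: two sparse entries against a rank-4 tensor.
  assert(EllipsisSpan(4, 2) == 3);
  return 0;
}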
-static void BuildDenseSliceSpec(const SparseSliceSpec &sparse, - DenseSliceSpec *dense) { - // Build expanded dense begin, end, strides, begin_mask, end_mask, and - // shrink_axis_mask. - dense->begin.resize(dense->dims); - dense->end.resize(dense->dims); - dense->strides.resize(dense->dims); - dense->begin_mask = 0; - dense->end_mask = 0; - dense->shrink_axis_mask = 0; - - // Count number of new_axis after ellipsis. This helps in calculating the - // number of dimensions ellipsis represents in the sparse spec. - bool ellipsis_seen = false; - int num_new_axis_after_ellipsis = 0; - for (int sparse_index = 0; sparse_index < sparse.dims; ++sparse_index) { - if (ellipsis_seen && IsSet(sparse.new_axis_mask, sparse_index)) - num_new_axis_after_ellipsis++; - if (IsSet(sparse.ellipsis_mask, sparse_index)) ellipsis_seen = true; - } - - int dense_index = 0; - for (int sparse_index = 0; sparse_index < sparse.dims; ++sparse_index) { - if (IsSet(sparse.new_axis_mask, sparse_index)) continue; - if (IsSet(sparse.ellipsis_mask, sparse_index)) { - auto next_index = std::min(dense->dims - (sparse.dims - sparse_index) + - 1 + num_new_axis_after_ellipsis, - dense->dims); - // Expand ellipsis into the appropriate dense indices. From current index - // until next_index, all dimensions would have begin and end masks set and - // stride 1, i.e., get all elements in those dimensions. - for (; dense_index < next_index; ++dense_index) { - dense->begin[dense_index] = dense->end[dense_index] = 0; - dense->strides[dense_index] = 1; - Set(dense->begin_mask, dense_index); - Set(dense->end_mask, dense_index); - } - continue; - } - assert(dense_index < dense->dims); - // Copy over the sparse indices to dense indices if ellipsis_mask and - // new_axis_mask are not set. - dense->begin[dense_index] = sparse.begin[sparse_index]; - dense->end[dense_index] = sparse.end[sparse_index]; - dense->strides[dense_index] = sparse.strides[sparse_index]; - CopyBit(sparse.begin_mask, sparse_index, dense->begin_mask, dense_index); - CopyBit(sparse.end_mask, sparse_index, dense->end_mask, dense_index); - CopyBit(sparse.shrink_axis_mask, sparse_index, dense->shrink_axis_mask, - dense_index); - dense_index++; - } -} - -// For the given `input_shape`, calculates the sliced shape using the given -// `begin`, `end`, and `stride` ranges and `begin_mask`, `end_mask`, and -// `shrink_axis_mask` masks. Updates the result back to `input_shape`. If -// `shrink_axis_mask` is not zero, this function will not drop the corresponding -// dimensions in `input_shape`; it will turn them into 1s. At the same time, -// canonicalizes `begin`, `end`, and `strides. The calculation follows -// tf.StridedSlice op semantics. -static void CalculateSlicedShapeFromDenseIndices( - MutableArrayRef input_shape, int32_t begin_mask, int32_t end_mask, - int32_t shrink_axis_mask, MutableArrayRef begin, - MutableArrayRef end, MutableArrayRef stride) { - assert(input_shape.size() <= 32); // Only 32-bit masks are supported. - - // Make sure ranges' ranks are consistent with the input. 
- assert(input_shape.size() == begin.size()); - assert(input_shape.size() == end.size()); - assert(input_shape.size() == stride.size()); - - for (int i = 0, e = input_shape.size(); i < e; ++i) { - if (ShapedType::isDynamic(input_shape[i])) continue; - - int64_t dim_i = input_shape[i]; - int64_t begin_i = begin[i]; - int64_t end_i = end[i]; - int64_t stride_i = stride[i]; - - // [0]: mask for begin, [1]: mask for end - int64_t masks[] = {begin_mask & (1 << i), end_mask & (1 << i)}; - // [0]: bound for begin, [1]: bound for end - int64_t bounds[] = {stride_i > 0 ? 0 : -1, - stride_i > 0 ? dim_i : dim_i - 1}; - - // Canonicalizes the given range `point` (begin/end) according to the - // current dimension. `c` means case: 0 for begin, 1 for end. - auto canonicalize = [&](int64_t point, int c) { - if (masks[c]) return stride_i > 0 ? bounds[c] : bounds[(c + 1) & 1]; - - // Add dim as offset to negative range point. - point = point < 0 ? dim_i + point : point; - return Clamp(point, bounds[0], bounds[1]); - }; - - begin_i = canonicalize(begin_i, 0); - end_i = canonicalize(end_i, 1); - - int64_t interval_len = end_i - begin_i; - int64_t size_i = 0; - // If internal length is zero or has different sign from stride, it's a - // degenerated case: we are slicing nothing. Otherwise, calculate the sliced - // size. - if (interval_len != 0 && (interval_len < 0) == (stride_i < 0)) - size_i = (interval_len / stride_i) + (interval_len % stride_i != 0); - - begin[i] = begin_i; - if (IsSet(shrink_axis_mask, i)) { - // Shrink this dimension. It means we only take the element at begin_i. - input_shape[i] = 1; - end[i] = begin_i + 1; - stride[i] = 1; - } else { - input_shape[i] = size_i; - end[i] = end_i; - stride[i] = stride_i; - } - } -} - -// For the given `input_shape`, calculates the sliced shape using the given -// `sparse_begin`, `sparse_end`, and `sparse_strides` ranges and `begin_mask`, -// `end_mask`, `ellipsis_mask` , `new_axis_mask` and `shrink_axis_mask` masks. -// Updates the result back to `input_shape`. -static void CalculateSlicedShapeFromSparseIndices( - MutableArrayRef input_shape, ArrayRef sparse_begin, - ArrayRef sparse_end, ArrayRef sparse_strides, - int32_t begin_mask, int32_t end_mask, int32_t ellipsis_mask, - int32_t new_axis_mask, int32_t shrink_axis_mask, - SmallVectorImpl *begin, SmallVectorImpl *end, - SmallVectorImpl *stride) { - int64_t num_sparse_indices = sparse_begin.size(); - SparseSliceSpec sparse = {num_sparse_indices, begin_mask, end_mask, - ellipsis_mask, new_axis_mask, shrink_axis_mask, - sparse_begin, sparse_end, sparse_strides}; - - // If no ellipsis_mask exists then an implicit ellipsis_mask at the end is - // inserted. This handles cases where foo[2:4] (foo.shape() = [4, 8]) yields - // a tensor of shape [2, 8], i.e., foo[2:4] is same as foo[2:4, ...]. 
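As an aside before the function body continues below, the per-dimension size arithmetic in CalculateSlicedShapeFromDenseIndices above is easy to miss; here is a standalone sketch of the same formula on assumed, already-canonicalized begin/end/stride values (illustrative names, no MLIR types):

```c++
#include <cstdint>
#include <iostream>

// Sliced size of one dimension, given canonicalized begin, end, and stride.
int64_t SlicedDimSize(int64_t begin, int64_t end, int64_t stride) {
  int64_t interval_len = end - begin;
  // Degenerate slice: empty interval, or interval and stride disagree in sign.
  if (interval_len == 0 || (interval_len < 0) != (stride < 0)) return 0;
  // Round the magnitude up, e.g. begin=2, end=7, stride=2 selects 3 elements.
  return interval_len / stride + (interval_len % stride != 0);
}

int main() {
  std::cout << SlicedDimSize(2, 7, 2) << "\n";   // 3  (indices 2, 4, 6)
  std::cout << SlicedDimSize(7, 2, -2) << "\n";  // 3  (indices 7, 5, 3)
  std::cout << SlicedDimSize(3, 3, 1) << "\n";   // 0  (empty slice)
  return 0;
}
```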
- if (sparse.ellipsis_mask == 0) { - Set(sparse.ellipsis_mask, sparse.dims); - sparse.dims++; - } - - int64_t dims = input_shape.size(); - DenseSliceSpec dense = {dims, - /*begin_mask = */ 0, - /*end_mask = */ 0, - /*shrink_axis_mask = */ 0, - *begin, - *end, - *stride}; - - BuildDenseSliceSpec(sparse, &dense); - CalculateSlicedShapeFromDenseIndices(input_shape, dense.begin_mask, - dense.end_mask, dense.shrink_axis_mask, - *begin, *end, *stride); -} - -bool StridedSliceOp::GetSlicedBoundRanges( - SmallVectorImpl *slice_begin, SmallVectorImpl *slice_end, - SmallVectorImpl *slice_stride) { - // TODO(hinsu): Support lowering for ops with dynamic begin and end values - // when it is possible to derive indices based on mask attributes. - DenseIntElementsAttr sparse_begin_attr, sparse_end_attr, sparse_strides_attr; - if (!matchPattern(begin(), m_Constant(&sparse_begin_attr)) || - !matchPattern(end(), m_Constant(&sparse_end_attr)) || - !matchPattern(strides(), m_Constant(&sparse_strides_attr))) - return false; - - auto input_ty = this->input().getType().dyn_cast(); - if (!input_ty || !input_ty.hasStaticShape()) return false; - auto input_shape = llvm::to_vector<4>(input_ty.getShape()); - - SmallVector sparse_begin, sparse_end, sparse_strides; - - for (const APInt &index : sparse_begin_attr) - sparse_begin.push_back(index.getSExtValue()); - for (const APInt &index : sparse_end_attr) - sparse_end.push_back(index.getSExtValue()); - for (const APInt &stride : sparse_strides_attr) - sparse_strides.push_back(stride.getSExtValue()); - - CalculateSlicedShapeFromSparseIndices( - input_shape, sparse_begin, sparse_end, sparse_strides, - begin_mask().getZExtValue(), end_mask().getZExtValue(), - ellipsis_mask().getZExtValue(), new_axis_mask().getZExtValue(), - shrink_axis_mask().getZExtValue(), slice_begin, slice_end, slice_stride); - return true; -} - -//===----------------------------------------------------------------------===// -// StridedSliceGradOp -//===----------------------------------------------------------------------===// - -static LogicalResult Verify(StridedSliceGradOp op) { - auto shape_type = op.shape().getType().dyn_cast(); - if (shape_type && shape_type.getRank() != 1) - return op.emitOpError("'shape' operand must be 1D tensor, but got ") - << shape_type.getRank() << "D tensor"; - - if (failed(VerifyStridedSliceBase(op))) return failure(); - - // TODO(antiagainst): verify the gradient op.dy()'s shape is consistent with - // the sliced type from StridedSlice. 
- - return success(); -} - -bool StridedSliceGradOp::GetSlicedShapeAndBoundRanges( - SmallVectorImpl *input_shape, - SmallVectorImpl *slice_begin, SmallVectorImpl *slice_end, - SmallVectorImpl *slice_stride) { - DenseIntElementsAttr shape_attr; - DenseIntElementsAttr sparse_begin_attr, sparse_end_attr, sparse_strides_attr; - if (!matchPattern(shape(), m_Constant(&shape_attr)) || - !matchPattern(begin(), m_Constant(&sparse_begin_attr)) || - !matchPattern(end(), m_Constant(&sparse_end_attr)) || - !matchPattern(strides(), m_Constant(&sparse_strides_attr))) - return false; - - int rank = std::distance(shape_attr.begin(), shape_attr.end()); - - input_shape->clear(); - input_shape->reserve(rank); - for (const APInt &dim : shape_attr) - input_shape->push_back(dim.getSExtValue()); - - SmallVector sparse_begin, sparse_end, sparse_strides; - - for (const APInt &index : sparse_begin_attr) - sparse_begin.push_back(index.getSExtValue()); - for (const APInt &index : sparse_end_attr) - sparse_end.push_back(index.getSExtValue()); - for (const APInt &stride : sparse_strides_attr) - sparse_strides.push_back(stride.getSExtValue()); - - CalculateSlicedShapeFromSparseIndices( - *input_shape, sparse_begin, sparse_end, sparse_strides, - begin_mask().getZExtValue(), end_mask().getZExtValue(), - ellipsis_mask().getZExtValue(), new_axis_mask().getZExtValue(), - shrink_axis_mask().getZExtValue(), slice_begin, slice_end, slice_stride); - return true; -} - -//===----------------------------------------------------------------------===// -// TensorListReserveOp -//===----------------------------------------------------------------------===// - -static LogicalResult Verify(TensorListReserveOp op) { - if (!IsOfRankOrUnranked(op.element_shape(), 0) && - !IsOfRankOrUnranked(op.element_shape(), 1)) { - return op.emitOpError("requires element_shape operand to be 0D/1D tensor"); - } - - if (!IsOfRankOrUnranked(op.num_elements(), 0)) { - return op.emitOpError("requires num_elements operand to be 0D tensor"); - } - return success(); -} - -//===----------------------------------------------------------------------===// -// TensorListElementShapeOp -//===----------------------------------------------------------------------===// - -OpFoldResult TensorListElementShapeOp::fold(ArrayRef operands) { - int width = - getType().cast().getElementType().getIntOrFloatBitWidth(); - auto variant_type = - getElementTypeOrSelf(getOperand().getType()).cast(); - if (variant_type.getSubtypes().empty()) return {}; - return ConvertShapeToAttr(variant_type.getSubtypes()[0], width); -} - -//===----------------------------------------------------------------------===// -// TensorListStackOp -//===----------------------------------------------------------------------===// - -static LogicalResult Verify(TensorListStackOp op) { - if (!IsOfRankOrUnranked(op.element_shape(), 0) && - !IsOfRankOrUnranked(op.element_shape(), 1)) { - return op.emitOpError("requires element_shape operand to be 0D/1D tensor"); - } - return success(); -} - -//===----------------------------------------------------------------------===// -// TensorScatterUpdateOp -//===----------------------------------------------------------------------===// - -static LogicalResult Verify(TensorScatterUpdateOp op) { - if (!HasRankAtLeast(op.tensor(), 1)) - return op.emitOpError( - "requires tensor operand to have at least 1 dimension"); - if (!HasRankAtLeast(op.indices(), 1)) - return op.emitOpError( - "requires indices operand to have at least 1 dimension"); - if (!HasRankAtLeast(op.updates(), 
1)) - return op.emitOpError( - "requires updates operand to have at least 1 dimension"); - - auto tensor_ty = op.tensor().getType().dyn_cast(); - auto indices_ty = op.indices().getType().dyn_cast(); - if (!tensor_ty || !indices_ty) return success(); - - int64_t num_index_dims = indices_ty.getShape().back(); - if (ShapedType::isDynamic(num_index_dims)) return success(); - - if (num_index_dims > tensor_ty.getRank()) - return op.emitOpError( - "requires tensor operand with rank greater than or equal to the " - "indices operand's last dimensions"); - return success(); -} - -//===----------------------------------------------------------------------===// -// TopKV2Op -//===----------------------------------------------------------------------===// - -static LogicalResult Verify(TopKV2Op op) { - if (!HasRankAtLeast(op.input(), 1)) - return op.emitOpError( - "requires input operand to have at least 1 dimension"); - - if (!IsOfRankOrUnranked(op.k(), 0)) - return op.emitOpError("requires k operand to be 0D tensor"); - - return success(); -} - -//===----------------------------------------------------------------------===// -// ToBoolOp -//===----------------------------------------------------------------------===// - -namespace { -// If the input to ToBoolOp is a `tensor`, then the ToBoolOp is an identity -// function and can be removed. -class ToBoolOfZeroDBoolTensor : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(ToBoolOp op, - PatternRewriter &rewriter) const override { - if (auto type = op.getOperand().getType().dyn_cast()) { - if (type.getRank() == 0 && type.getElementType().isInteger(1)) { - rewriter.replaceOp(op, op.getOperand()); - return success(); - } - } - return failure(); - } -}; -} // namespace - -void ToBoolOp::getCanonicalizationPatterns(OwningRewritePatternList &results, - MLIRContext *context) { - results.insert(context); -} - -//===----------------------------------------------------------------------===// -// TransposeOp -//===----------------------------------------------------------------------===// - -static LogicalResult Verify(TransposeOp op) { - // TODO(hinsu): Verify using a custom verifier that, - // * Transpose permutation is 1-D of size equal to the rank of the first - // input, if the shapes are partially known. Requires use of a more - // restrictive type than TF_Tensor. - // * Result shape dimensions are possible based on the input shape. - return success(); -} - -// TODO(jpienaar): perm could be optional too. -void TransposeOp::build(OpBuilder &builder, OperationState &result, Value x, - Value perm) { - auto x_type = x.getType().cast(); - // If value is unranked, then so is results. - if (!x_type.hasRank()) - return TransposeOp::build(builder, result, - UnrankedTensorType::get(x_type.getElementType()), - x, perm); - - // TODO(jpienaar): Handle unknown perm case. - - // TODO(jpienaar): Extract utility function. 
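The TransposeOp builder continues below by re-indexing the operand shape with a constant permutation; a standalone sketch of that shape rule, with illustrative shapes only:

```c++
#include <cstdint>
#include <iostream>
#include <vector>

// With a constant permutation, the transpose result shape is the input shape
// re-indexed by the permutation: result[i] = shape[perm[i]].
std::vector<int64_t> PermutedShape(const std::vector<int64_t> &shape,
                                   const std::vector<int64_t> &perm) {
  std::vector<int64_t> result;
  result.reserve(perm.size());
  for (int64_t p : perm) result.push_back(shape[p]);
  return result;
}

int main() {
  // Transposing a 2x3x4 tensor with perm = [2, 0, 1] yields shape 4x2x3.
  for (int64_t d : PermutedShape({2, 3, 4}, {2, 0, 1})) std::cout << d << " ";
  std::cout << "\n";
  return 0;
}
```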
- auto etype = x_type.cast().getElementType(); - DenseIntElementsAttr attr_shape; - if (matchPattern(perm, m_Constant(&attr_shape))) { - llvm::SmallVector const_shape; - if (attr_shape.isSplat()) { - const_shape.assign( - attr_shape.getNumElements(), - x_type.getDimSize((*attr_shape.begin()).getSExtValue())); - } else { - const_shape.reserve(attr_shape.getNumElements()); - for (const auto &dim : attr_shape) - const_shape.push_back(x_type.getDimSize(dim.getSExtValue())); - } - return TransposeOp::build( - builder, result, RankedTensorType::get(const_shape, etype), x, perm); - } - return TransposeOp::build(builder, result, UnrankedTensorType::get(etype), x, - perm); -} - -namespace { - -OpFoldResult FoldIdentityTranspose(TransposeOp op) { - auto const_perm = dyn_cast_or_null(op.perm().getDefiningOp()); - if (!const_perm) return {}; - - auto const_value = const_perm.value(); - const auto elements = const_value.getValues(); - - for (auto it : llvm::enumerate(elements)) { - if (it.index() != it.value()) return {}; - } - - // TODO(jpienaar): Remove if/when we handle this more generally. - if (op.getType() != op.x().getType()) { - // If the types don't match then only fold if all the operands are in the TF - // dialect. - for (auto user : op.getOperation()->getUsers()) - if (user->getDialect() != op.getDialect()) return {}; - } - - return op.x(); -} - -OpFoldResult FoldCancellableTranspose(TransposeOp op) { - // Operand is a TransposeOp. - auto transpose = dyn_cast_or_null(op.x().getDefiningOp()); - if (!transpose) return {}; - - // Permutations defined by constant operations. - auto perm0 = dyn_cast_or_null(op.perm().getDefiningOp()); - auto perm1 = dyn_cast_or_null(transpose.perm().getDefiningOp()); - if (!perm0 || !perm1) return {}; - - // With permutation indices that cancel each other - auto perm0_value = perm0.value().cast(); - auto perm1_value = perm1.value().cast(); - if (!AreCancellablePermutations(perm0_value, perm1_value)) return {}; - - return transpose.x(); -} - -} // namespace - -OpFoldResult TransposeOp::fold(ArrayRef operands) { - if (auto folded = FoldIdentityTranspose(*this)) return folded; - if (auto folded = FoldCancellableTranspose(*this)) return folded; - return {}; -} - -//===----------------------------------------------------------------------===// -// TruncateDivOp -//===----------------------------------------------------------------------===// - -void TruncateDivOp::getCanonicalizationPatterns( - OwningRewritePatternList &results, MLIRContext *context) { - results.insert(context); -} - -//===----------------------------------------------------------------------===// -// UnpackOp -//===----------------------------------------------------------------------===// - -static LogicalResult Verify(UnpackOp op) { - auto value_type = op.value().getType().dyn_cast(); - if (!value_type) return success(); - - int64_t value_rank = value_type.getRank(); - int64_t axis = op.axis().getSExtValue(); - if (axis < -value_rank || axis >= value_rank) - return op.emitOpError("axis attribute must be in the range of [-") - << value_rank << ", " << value_rank << ')'; - - axis = GetDimForAxis(axis, value_rank); - int64_t dim_size = value_type.getDimSize(axis); - if (ShapedType::isDynamic(dim_size)) return success(); - - if (dim_size != op.getNumResults()) - return op.emitOpError("result count must be equal to ") << dim_size; - - return success(); -} - -//===----------------------------------------------------------------------===// -// Unsorted segment reduction ops 
-//===----------------------------------------------------------------------===// - -template -static LogicalResult VerifyUnsortedSegmentReduction(Op op) { - if (!HasRankAtMost(op.num_segments(), 0)) - return op.emitOpError("number of segments should be a 0-D tensor"); - - auto data_type = op.data().getType().template dyn_cast(); - auto segment_ids_type = - op.segment_ids().getType().template dyn_cast(); - if (data_type && segment_ids_type) { - if (data_type.getRank() < segment_ids_type.getRank()) - return op.emitOpError( - "requires segment ids rank to be less than or equal to data's rank"); - - int index = 0; - for (auto shape_pair : - llvm::zip_first(segment_ids_type.getShape(), data_type.getShape())) { - int64_t segment_id_dim = std::get<0>(shape_pair); - int64_t data_dim = std::get<1>(shape_pair); - if (!ShapedType::isDynamic(segment_id_dim) && - !ShapedType::isDynamic(data_dim) && segment_id_dim != data_dim) - return op.emitOpError( - "requires segment ids shape to be a prefix of data shape, " - "but dimension #") - << index << " differs: " << segment_id_dim << " vs. " - << data_dim; - ++index; - } - } - - DenseIntElementsAttr num_segments_attr; - if (matchPattern(op.num_segments(), m_Constant(&num_segments_attr))) { - int64_t num_segments = (*num_segments_attr.begin()).getSExtValue(); - if (num_segments < 0) - return op.emitOpError("num of segments cannot be negative"); - } - - return success(); -} - -//===----------------------------------------------------------------------===// -// VariableShapeOp -//===----------------------------------------------------------------------===// - -static LogicalResult Verify(VariableShapeOp op) { - auto input_type = op.input().getType().cast(); - if (input_type.hasStaticShape() && input_type.getNumElements() != 1) - return op.emitOpError("requires input to have one resource"); - - auto resource_type = input_type.getElementType().cast(); - auto subtypes = resource_type.getSubtypes(); - switch (subtypes.size()) { - case 1: - return VerifyShapeOperandAndResult( - op, resource_type.getSubtypes().front(), op.getType()); - case 0: - return VerifyShapeOperandAndResult(op, Type(), op.getType()); - default: - return op.emitOpError( - "requires resource input type to have at most 1 subtype"); - } -} - -OpFoldResult VariableShapeOp::fold(ArrayRef operands) { - int width = - getType().cast().getElementType().getIntOrFloatBitWidth(); - auto resource_type = - getElementTypeOrSelf(getOperand().getType()).cast(); - if (resource_type.getSubtypes().empty()) return {}; - return ConvertShapeToAttr(resource_type.getSubtypes()[0], width); -} - -//===----------------------------------------------------------------------===// -// WhileOp -//===----------------------------------------------------------------------===// - -static LogicalResult Verify(WhileOp op) { - auto module = op.getParentOfType(); - auto cond_fn = module.lookupSymbol(op.cond()); - auto body_fn = module.lookupSymbol(op.body()); - if (!cond_fn) { - return op.emitOpError("cond refers to an undefined function : ") - << op.cond(); - } - if (!body_fn) { - return op.emitOpError("body refers to an undefined function : ") - << op.body(); - } - - auto cond_fn_type = cond_fn.getType(); - auto body_fn_type = body_fn.getType(); - - // Verify that the cond function has exactly one result. 
- if (cond_fn_type.getNumResults() != 1) - return op.emitOpError("requires cond function to have exactly one result"); - - SmallVector operands(op.getOperandTypes()); - - // Collect all the type lists for the op so that different pairs of type lists - // can be compared for the compatibility. - constexpr int kNumTypeLists = 5; - const std::array>, kNumTypeLists> - type_lists = {{ - {"operand", operands}, - {"body function result", body_fn_type.getResults()}, - {"result", op.getResultTypes()}, - {"cond function input", cond_fn_type.getInputs()}, - {"body function input", body_fn_type.getInputs()}, - }}; - - // A pair of type lists should be cast compatible with each other if one is - // converted to the another for a function call or assignment or there is a - // common source of inputs for both. Therefore, the While op requires the - // following pairs of type lists to be cast compatible for the tensor_cast - // operation: - // - // * Operands and cond inputs to call the cond function before the - // first iteration. - // * Operands and body inputs to call the body function for the first - // iteration if the cond functions returns True or equivalent result. - // * Operands and results to assign cond function arguments to op results if - // the cond function returns False or equivalent result. - // * All three pairs using cond inputs, body inputs and results as operand is - // a common source for all three. - // * Body result and cond inputs to call the cond function for the subsequent - // iterations. Similarly, Body result should be compatible with body inputs - // and op results. - // - // Note that the operands and body results need not be compatible as they are - // never converted from one to the another nor there is a common source - // tensors. Compatibility requirement is not transitive. - - for (int i = 0; i < kNumTypeLists; ++i) { - // Skip the first pair as the While op operands and body function results - // does not need to be compatible with each other. - for (int j = std::max(2, i + 1); j < kNumTypeLists; ++j) { - auto &a = type_lists[i]; - auto &b = type_lists[j]; - - int a_size = a.second.size(); - if (a_size != b.second.size()) - return op.emitOpError( - llvm::formatv("requires the number of {0}s to be equal to the " - "number of {1}s. Found {2} and {3}, respectively", - a.first, b.first, a_size, b.second.size())); - - for (int idx = 0; idx < a_size; ++idx) { - auto a_type = a.second[idx]; - auto b_type = b.second[idx]; - - if (!AreCastCompatible({a_type, b_type})) - return op.emitError(llvm::formatv( - "{0} type {1} is incompatible with {2} type {3} at index {4}", - a.first, a_type, b.first, b_type, idx)); - } - } - } - return success(); -} - -//===----------------------------------------------------------------------===// -// WhileRegionOp -//===----------------------------------------------------------------------===// -static LogicalResult Verify(WhileRegionOp op) { - // Verify that the condition generates a single tensor result. - YieldOp yield = cast(op.cond().front().getTerminator()); - if (yield.getNumOperands() != 1) - return op.emitOpError() - << "condition should have a single tensor result"; - - auto cond_type = yield.getOperand(0).getType().dyn_cast(); - if (!cond_type || !cond_type.getShape().equals({}) || - !cond_type.getElementType().isInteger(/*width=*/1)) - return op.emitOpError() - << "condition should have a single tensor result"; - - // The body result types should match while op result types. 
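Stepping back to the WhileOp verifier above: a standalone sketch of exactly which pairs its nested loop compares, using the same `max(2, i + 1)` lower bound (plain strings stand in for the five type lists); the WhileRegionOp checks resume below.

```c++
#include <algorithm>
#include <array>
#include <iostream>

int main() {
  const std::array<const char *, 5> kLists = {
      "operand", "body function result", "result", "cond function input",
      "body function input"};
  for (int i = 0; i < 5; ++i)
    for (int j = std::max(2, i + 1); j < 5; ++j)
      std::cout << kLists[i] << " <-> " << kLists[j] << "\n";
  // The (operand, body function result) pair is skipped (j starts at 2 when
  // i == 0) because those two lists never need to be cast compatible.
  return 0;
}
```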
- if (failed(VerifyRegionResults(op, op.body(), "body"))) return failure(); - - // Both condition and body should have same number and type of operands as - // the WhileRegion inputs. - const int num_inputs = op.getNumOperands(); - auto block_inputs_match_op_inputs = [&](Region ®ion, - StringRef name) -> LogicalResult { - Block &block = region.front(); - if (block.getNumArguments() != num_inputs) - return op.emitOpError() - << name << " should have same number of inputs (" << num_inputs - << ") as " << WhileRegionOp::getOperationName() << " but has " - << block.getNumArguments() << " inputs"; - - for (auto types_idx : llvm::enumerate( - llvm::zip(op.getOperandTypes(), block.getArgumentTypes()))) { - auto op_input_type = std::get<0>(types_idx.value()); - auto block_input_type = std::get<1>(types_idx.value()); - if (!AreCastCompatible({block_input_type, op_input_type})) - return op.emitOpError(llvm::formatv( - "{0} input type {1} is incompatible with {2} " - "input type {3} at index {4}", - name, block_input_type, WhileRegionOp::getOperationName(), - op_input_type, types_idx.index())); - } - return success(); - }; - - if (failed(block_inputs_match_op_inputs(op.cond(), "condition")) || - failed(block_inputs_match_op_inputs(op.body(), "body"))) - return failure(); - - return success(); -} - -//===----------------------------------------------------------------------===// -// WhileRegionOp LoopLikeOpInterface -//===----------------------------------------------------------------------===// - -Region &WhileRegionOp::getLoopBody() { return body(); } - -bool WhileRegionOp::isDefinedOutsideOfLoop(Value value) { - // If the Op defining the value exists and the defining op is outside the - // scope of this WhileRegion, then we can infer that its defined outside. - // The defining Op is outside the scope of this WhileRegion if this - // WhileRegionOp is not an ancestor of the defining op in the parent chain. - Operation *def_op = value.getDefiningOp(); - return def_op && !getOperation()->isAncestor(def_op); -} - -LogicalResult WhileRegionOp::moveOutOfLoop( - llvm::ArrayRef ops) { - // Move the hoisted value to just before the while. - Operation *while_op = this->getOperation(); - for (auto op : ops) op->moveBefore(while_op); - return success(); -} - -//===----------------------------------------------------------------------===// -// WhileRegionOp canonicalization -//===----------------------------------------------------------------------===// -namespace { -// Eliminate values that pass through the WhileRegionOp body. -struct WhileRegionEliminatePassThrough - : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(WhileRegionOp while_op, - PatternRewriter &rewriter) const override { - // Replace values that simply passthrough the body with extern values. The - // block arguments of body and while match and so the corresponding cond - // argument can be easily found. - int old_num_operands = while_op.getNumOperands(); - int new_num_operands = old_num_operands; - auto &body_block = while_op.body().front(); - auto &cond_block = while_op.cond().front(); - auto &yield = *body_block.getTerminator(); - - // Bit mask indicating which operands will be removed. 
- SmallVector removed_operand(old_num_operands, false); - - for (int op_idx : llvm::seq(0, old_num_operands)) { - auto body_arg = body_block.getArgument(op_idx); - if (body_arg == yield.getOperand(op_idx)) { - // Replace the use of the passthrough value with the while operand - // in the body and condition regions, as well as the while output (if - // type match) - // TODO(jurahul): Use PatternRewriter API for IR modification. - auto value = while_op.getOperand(op_idx); - if (body_arg.getType() == value.getType()) - body_arg.replaceAllUsesWith(value); - - auto cond_arg = cond_block.getArgument(op_idx); - if (cond_arg.getType() == value.getType()) - cond_arg.replaceAllUsesWith(value); - - auto result = while_op.getResult(op_idx); - if (result.getType() == value.getType()) - result.replaceAllUsesWith(value); - } - - // Now check if the operand is unused in both regions as well as the - // result. If so, mark it for removal. - if (body_block.getArgument(op_idx).use_empty() && - cond_block.getArgument(op_idx).use_empty() && - while_op.getResult(op_idx).use_empty()) { - removed_operand[op_idx] = true; - new_num_operands--; - } - } - - if (new_num_operands == old_num_operands) return failure(); - - // Compress the operands, region arguments, and outputs. - SmallVector new_while_operands; - SmallVector new_result_types; - new_while_operands.reserve(new_num_operands); - new_result_types.reserve(new_num_operands); - - // Build new operands and result type. - int next_idx = 0; - for (int op_idx : llvm::seq(0, old_num_operands)) { - if (removed_operand[op_idx]) continue; - new_while_operands.push_back(while_op.getOperand(op_idx)); - new_result_types.push_back(while_op.getResult(op_idx).getType()); - next_idx++; - } - - // Create the new while operation. - auto new_while_op = - rewriter.create(while_op.getLoc(), new_result_types, - new_while_operands, while_op.getAttrs()); - - // Move region bodies to the new while. - rewriter.inlineRegionBefore(while_op.cond(), new_while_op.cond(), - new_while_op.cond().end()); - rewriter.inlineRegionBefore(while_op.body(), new_while_op.body(), - new_while_op.body().end()); - - auto &new_cond_block = new_while_op.cond().front(); - auto &new_body_block = new_while_op.body().front(); - auto &new_yield = *new_body_block.getTerminator(); - - // Build a vector of new results. Also patch up the region bodies and yield. 
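A standalone sketch of the index compression this pattern performs (the real loops appear just above and below); plain strings and a boolean mask stand in for the while operands and the `removed_operand` vector:

```c++
#include <cstdio>
#include <vector>

int main() {
  const std::vector<const char *> old_operands = {"%a", "%b", "%c", "%d"};
  const std::vector<bool> removed = {false, true, false, true};

  std::vector<const char *> new_operands;
  std::vector<int> old_to_new(old_operands.size(), -1);  // -1 means dropped
  for (size_t i = 0; i < old_operands.size(); ++i) {
    if (removed[i]) continue;
    old_to_new[i] = static_cast<int>(new_operands.size());
    new_operands.push_back(old_operands[i]);
  }
  for (size_t i = 0; i < old_operands.size(); ++i)
    std::printf("old index %zu (%s) -> new index %d\n", i, old_operands[i],
                old_to_new[i]);
  return 0;
}
```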
- SmallVector new_results; - next_idx = 0; - for (int op_idx : llvm::seq(0, old_num_operands)) { - if (removed_operand[op_idx]) { - new_cond_block.eraseArgument(next_idx); - new_body_block.eraseArgument(next_idx); - new_yield.eraseOperand(next_idx); - new_results.push_back(nullptr); - } else { - new_results.push_back(new_while_op.getResult(next_idx++)); - } - } - - rewriter.replaceOp(while_op, new_results); - return success(); - } -}; - -} // anonymous namespace - -void WhileRegionOp::getCanonicalizationPatterns( - OwningRewritePatternList &results, MLIRContext *context) { - results.insert(context); -} - -//===----------------------------------------------------------------------===// -// XdivyOp -//===----------------------------------------------------------------------===// - -void XdivyOp::getCanonicalizationPatterns(OwningRewritePatternList &results, - MLIRContext *context) { - results.insert(context); -} - -//===----------------------------------------------------------------------===// -// TableGen'd op method definitions -//===----------------------------------------------------------------------===// - -#define GET_OP_CLASSES -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc.inc" - //===----------------------------------------------------------------------===// // TF Dialect Interfaces //===----------------------------------------------------------------------===// namespace { +// Returns true if the op can be duplicated. +bool CanDuplicate(Operation *op) { + // If the op is marked with the cannot duplicate trait, it cannot be + // duplicated. + if (op->hasTrait()) return false; + + // If the op has no memory side effects, it can be duplicated. + if (MemoryEffectOpInterface::hasNoEffect(op)) return true; + + // If the op is marked stateless using the `is_stateless` attribute, that + // attribute determines if the op can be duplicated. + if (auto is_stateless = op->getAttrOfType("is_stateless")) + return is_stateless.getValue(); + + // Otherwise, assume ops can be duplicated by default. + return true; +} + +// Returns true of the given function has a single uses (within the scope +// of the module containing it and all parent modules). +bool HasSingleUse(FuncOp func) { + // Public function can have any number of external uses. + if (func.isPublic()) return false; + + // Return false if unexpected IR structure seen. + ModuleOp module = func.getParentOfType(); + if (!module) return false; + + // Inspect function uses in the containing module and all parent + // modules. + bool use_seen = false; + for (; module; module = module.getParentOfType()) { + auto func_uses_optional = + SymbolTable::getSymbolUses(func, &module.getBodyRegion()); + // Found an unknown use. + if (!func_uses_optional) return false; + + // If no uses in this scope, continue looking in parent module + SymbolTable::UseRange func_uses = func_uses_optional.getValue(); + if (func_uses.empty()) continue; + + // Check if multiple uses at this scope or another use already seen. + if (!llvm::hasSingleElement(func_uses) || use_seen) return false; + + // This is the first use seen. + use_seen = true; + + // If the function is private, no need to inspect parent modules. + if (func.isPrivate()) break; + } + + // No multiple uses seen. 
+ return true; +} + struct TFInlinerInterface : public DialectInlinerInterface { using DialectInlinerInterface::DialectInlinerInterface; @@ -4433,8 +136,8 @@ struct TFInlinerInterface : public DialectInlinerInterface { // Analysis Hooks //===--------------------------------------------------------------------===// - // Defines the legality of inlinining 'src' region into the 'dest' region - // attached to a TF operation + // Returns if its legal to inline 'src' region into the 'dest' region + // attached to a TF operation. bool isLegalToInline(Region *dest, Region *src, BlockAndValueMapping &valueMapping) const final { // Allow inlining in regions attached to region based control flow @@ -4443,13 +146,17 @@ struct TFInlinerInterface : public DialectInlinerInterface { llvm::hasSingleElement(*src); } - // Defines the legality of inlining TF operations. - bool isLegalToInline(Operation *, Region *, + // Returns true if its legal to inline a TF operation `op` into the `dest` + // region. + bool isLegalToInline(Operation *op, Region *dest, BlockAndValueMapping &) const final { - // TODO(riverriddle) For now, enable inlining all operations. This isn't - // correct in the face of operations that cannot be duplicated, but this - // requires more intricate side-effect modeling. - return true; + // An op is legal to inline if either of the following conditions is true: + // (a) Its legal to duplicate the Op. + // (a) The Op is inside a single use function. If that function is inlined, + // post inlining, the function will be dead and eliminated from the IR. + // So there won't be any code duplication. + FuncOp func = op->getParentOfType(); + return !func || CanDuplicate(op) || HasSingleUse(func); } //===--------------------------------------------------------------------===// @@ -4476,17 +183,15 @@ struct TFInlinerInterface : public DialectInlinerInterface { // TF Dialect //===----------------------------------------------------------------------===// -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.cc.inc" - std::vector *TensorFlowDialect::additional_operation_hooks_ = new std::vector(); TensorFlowDialect::TensorFlowDialect(MLIRContext *context) - : Dialect(/*name=*/"tf", context) { + : Dialect(/*name=*/"tf", context, TypeID::get()) { addOperations< #define GET_OP_LIST -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc.inc" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_all_ops.cc.inc" >(); addTypes< #define HANDLE_TF_TYPE(tftype, enumerant, name) tftype##Type, diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h index f37b71575f6..039ed1bc3a8 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h @@ -23,6 +23,7 @@ limitations under the License. #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project #include "mlir/IR/Matchers.h" // from @llvm-project #include "mlir/IR/Module.h" // from @llvm-project #include "mlir/IR/OpImplementation.h" // from @llvm-project @@ -35,6 +36,9 @@ limitations under the License. 
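Stepping back to the inliner hook above: a standalone sketch of the legality rule it implements, with plain bools standing in for the trait, side-effect, and attribute queries (illustrative only); the tf_ops.h header changes continue below.

```c++
#include <iostream>

struct OpInfo {
  bool has_cannot_duplicate_trait;
  bool has_no_memory_effects;
  bool is_stateless_attr_value;  // value of the `is_stateless` attribute
  bool has_is_stateless_attr;
};

bool CanDuplicate(const OpInfo &op) {
  if (op.has_cannot_duplicate_trait) return false;
  if (op.has_no_memory_effects) return true;
  if (op.has_is_stateless_attr) return op.is_stateless_attr_value;
  return true;  // default: assume the op can be duplicated
}

// Inlining is legal if the op is outside any function, can be duplicated, or
// lives in a single-use function (which dies after inlining, so no copies).
bool IsLegalToInline(const OpInfo &op, bool in_function,
                     bool func_has_single_use) {
  return !in_function || CanDuplicate(op) || func_has_single_use;
}

int main() {
  OpInfo stateful_op{/*trait=*/false, /*no_effects=*/false,
                     /*is_stateless=*/false, /*has_attr=*/true};
  std::cout << IsLegalToInline(stateful_op, /*in_function=*/true,
                               /*func_has_single_use=*/true)
            << "\n";  // 1: single-use function, so inlining duplicates nothing
  return 0;
}
```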
#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" @@ -112,17 +116,6 @@ class TensorFlowDialect : public Dialect { static std::vector *additional_operation_hooks_; }; -// TODO(b/131258166): TensorFlow's mutex.h defines a `mutex_lock` macro, whose -// purpose is to catch bug on `tensorflow::mutex_lock`. We don't use -// `tensorflow::mutex_lock` here but we have ops (`tf.MutexLock` and -// `tf.ConsumeMutexLock`) with getter methods named as `mutex_lock()`. Need to -// undefine here to avoid expanding the getter symbol as macro when including -// both mutex.h and this header file. -#undef mutex_lock - -#define GET_OP_CLASSES -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h.inc" - } // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td index 7c6e6c672ae..5269bb82239 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td @@ -68,6 +68,51 @@ class TF_TensorListInitOp : TF_Op { }]; } +def TF_CaseOp : TF_Op<"Case", []> { + let summary = [{ +An n-way switch statement which calls a single branch function. + }]; + + let description = [{ +An n-way switch statement, implementing the following: + ``` + switch (branch_index) { + case 0: + output = branches[0](input); + break; + case 1: + output = branches[1](input); + break; + ... + case [[nbranches-1]]: + default: + output = branches[nbranches-1](input); + break; + } + ``` + }]; + + let arguments = (ins + I32Tensor:$branch_index, + Variadic:$input, + + Confined]>:$branches, + DefaultValuedAttr:$output_shapes, + + // Used to map StatelessCase and Case to a common op. + DefaultValuedAttr:$is_stateless + ); + + let results = (outs + Variadic:$output + ); + + TF_DerivedOperandTypeListAttr Tin = TF_DerivedOperandTypeListAttr<1>; + TF_DerivedResultTypeListAttr Tout = TF_DerivedResultTypeListAttr<0>; + + let hasCanonicalizer = 1; +} + // In MLIR, the TensorFlow tensor value is represented as an ElementsAttr, with // its type encoding the tensor's shape and data type. def TF_ConstOp : TF_Op<"Const", [ConstantLike, NoSideEffect, @@ -225,10 +270,25 @@ else_branch: A function that takes 'inputs' and returns a list of TF_DerivedOperandTypeAttr Tcond = TF_DerivedOperandTypeAttr<0>; TF_DerivedOperandTypeListAttr Tin = TF_DerivedOperandTypeListAttr<1>; TF_DerivedResultTypeListAttr Tout = TF_DerivedResultTypeListAttr<0>; + TF_DerivedResultShapeListAttr output_shapes = TF_DerivedResultShapeListAttr<0>; let verifier = [{ return Verify(*this); }]; + + let hasCanonicalizer = 1; + + let extraClassDeclaration = [{ + // Get the then branch function. + FuncOp then_func() { + return SymbolTable::lookupNearestSymbolFrom(*this, then_branch()); + } + + // Get the else branch function. 
+ FuncOp else_func() { + return SymbolTable::lookupNearestSymbolFrom(*this, else_branch()); + } + }]; } def TF_YieldOp : TF_Op<"Yield", @@ -331,8 +391,8 @@ def TF_LegacyCallOp : TF_Op<"LegacyCall", within the same symbol scope as the call and is mapped to a GraphDef node with the function name as the op name. Unlike a PartitionedCall which represents asynchronously executing a function across multiple devices, a - LegacyCall represents a function call with the only attribute - _diable_call_shape_inference. + LegacyCall ignores specification for ops in the attached function and + instead executes it on the device assigned to this op. }]; let arguments = (ins @@ -351,8 +411,11 @@ def TF_LegacyCallOp : TF_Op<"LegacyCall", operand_range getArgOperands() { return args(); } // Returns the callee of this operation. - CallInterfaceCallable getCallableForCallee() { - return getAttrOfType("f"); + CallInterfaceCallable getCallableForCallee() { return fAttr(); } + + // returns the callee of this operation. + FuncOp func() { + return SymbolTable::lookupNearestSymbolFrom(*this, f()); } }]; } @@ -469,8 +532,11 @@ underlying graph, and executes each of the partitioned subgraphs as a function. operand_range getArgOperands() { return args(); } // Returns the callee of this operation. - CallInterfaceCallable getCallableForCallee() { - return getAttrOfType("f"); + CallInterfaceCallable getCallableForCallee() { return fAttr(); } + + // returns the callee of this operation. + FuncOp func() { + return SymbolTable::lookupNearestSymbolFrom(*this, f()); } }]; @@ -575,8 +641,11 @@ underlying graph, and executes each of the partitioned subgraphs as a function. operand_range getArgOperands() { return args(); } // Returns the callee of this operation. - CallInterfaceCallable getCallableForCallee() { - return getAttrOfType("f"); + CallInterfaceCallable getCallableForCallee() { return fAttr(); } + + // returns the callee of this operation. + FuncOp func() { + return SymbolTable::lookupNearestSymbolFrom(*this, f()); } }]; @@ -610,7 +679,6 @@ body: A function that takes a list of tensors and returns another FlatSymbolRefAttr:$cond, FlatSymbolRefAttr:$body, - DefaultValuedAttr:$output_shapes, DefaultValuedAttr:$parallel_iterations, // Used to map StatelessWhile and While op defined in TensorFlow to a common @@ -623,10 +691,24 @@ body: A function that takes a list of tensors and returns another ); TF_DerivedOperandTypeListAttr T = TF_DerivedOperandTypeListAttr<0>; + TF_DerivedResultShapeListAttr output_shapes = TF_DerivedResultShapeListAttr<0>; let verifier = [{ return Verify(*this); }]; + let hasCanonicalizer = 1; + + let extraClassDeclaration = [{ + // Get the condition function. + FuncOp cond_func() { + return SymbolTable::lookupNearestSymbolFrom(*this, cond()); + } + + // Get the body function. + FuncOp body_func() { + return SymbolTable::lookupNearestSymbolFrom(*this, body()); + } + }]; } def TL_WhileRegionOp : TF_Op<"WhileRegion", @@ -1068,31 +1150,6 @@ def TF_TensorSliceDatasetOp : TF_Op<"TensorSliceDataset", []> { TF_DerivedOperandTypeListAttr Toutput_types = TF_DerivedOperandTypeListAttr<0>; } -// TODO(b/156507832): Move tf.InplaceUpdate to tf_generated_ops.td once -// autogenerated op def matches. -def TF_InplaceUpdateOp : TF_Op<"InplaceUpdate", [NoSideEffect]> { - let summary = "Updates specified rows 'i' with values 'v'."; - - let description = [{ -Computes `x[i, :] = v; return x`. - -Originally this function is mutative however for compilation we make this -operation create / operate on a copy of `x`. 
- }]; - - let arguments = (ins - TF_Tensor:$x, - I32Tensor:$i, - TF_Tensor:$v - ); - - let results = (outs - TF_Tensor:$y - ); - - TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; -} - def TF_BesselI0eOp : TF_Op<"BesselI0e", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Computes the Bessel i0e function of `x` element-wise."; @@ -1188,12 +1245,143 @@ def TF_TPUPartitionedCallOp : TF_Op<"TPUPartitionedCall", [CallOpInterface]> { operand_range getArgOperands() { return args(); } // Returns the callee of this operation. - CallInterfaceCallable getCallableForCallee() { - return getAttrOfType("f"); + CallInterfaceCallable getCallableForCallee() { return fAttr(); } + + // returns the callee of this operation. + FuncOp func() { + return SymbolTable::lookupNearestSymbolFrom(*this, f()); } }]; let verifier = [{ return VerifyPartitionedCall(*this); }]; } +class TF_FusedBatchNormOpBase : TF_Op { + let summary = "Batch normalization."; + + let description = [{ +Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW". +The size of 1D Tensors matches the dimension C of the 4D Tensors. + }]; + + let arguments = (ins + TensorOf<[BF16, F16, F32]>:$x, + F32Tensor:$scale, + F32Tensor:$offset, + F32Tensor:$mean, + F32Tensor:$variance, + + DefaultValuedAttr:$epsilon, + DefaultValuedAttr:$exponential_avg_factor, + DefaultValuedAttr:$data_format, + DefaultValuedAttr:$is_training + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr U = TF_DerivedOperandTypeAttr<1>; + + let extraClassDeclaration = [{ + // TF_FoldOperandsTransposeInterface: + SmallVector GetLayoutDependentArgs() { return {0}; } + SmallVector GetLayoutDependentResults() { return {0}; } + LogicalResult FoldOperandsPermutation(ArrayRef permutation); + + // TF_LayoutSensitiveInterface: + StringRef GetOptimalLayout(const RuntimeDevices& devices); + LogicalResult UpdateDataFormat(StringRef data_format); + }]; +} + +def TF_FusedBatchNormV2Op : TF_FusedBatchNormOpBase<"FusedBatchNormV2"> { + let results = (outs + TensorOf<[BF16, F16, F32]>:$y, + F32Tensor:$batch_mean, + F32Tensor:$batch_variance, + F32Tensor:$reserve_space_1, + F32Tensor:$reserve_space_2 + ); +} + +def TF_FusedBatchNormV3Op : TF_FusedBatchNormOpBase<"FusedBatchNormV3"> { + let results = (outs + TensorOf<[BF16, F16, F32]>:$y, + F32Tensor:$batch_mean, + F32Tensor:$batch_variance, + F32Tensor:$reserve_space_1, + F32Tensor:$reserve_space_2, + F32Tensor:$reserve_space_3 + ); +} + +def TF_BatchFunctionOp : TF_Op<"BatchFunction", [AttrSizedOperandSegments]> { + let summary = [{ +Batches all the inputs tensors to the computation done by the function. + }]; + + let description = [{ +So, for example, in the following code + + ```python + + # This input will be captured. + y = tf.placeholder_with_default(1.0, shape=[]) + + @tf.Defun(tf.float32) + def computation(a): + return tf.matmul(a, a) + y + + b = gen_batch_ops.batch_function( + f=computation + in_tensors=[a], + captured_tensors=computation.captured_inputs, + Tout=[o.type for o in computation.definition.signature.output_arg], + num_batch_threads=1, + max_batch_size=10, + batch_timeout_micros=100000, # 100ms + allowed_batch_sizes=[3, 10], + batching_queue="") + ``` + +If more than one session.run call is simultaneously trying to compute `b` +the values of `a` will be gathered, non-deterministically concatenated +along the first axis, and only one thread will run the computation. 
+ +Assumes that all arguments of the function are Tensors which will be batched +along their first dimension. + +Arguments that are captured, are not batched. The session.run call which does +the concatenation, will use the values of the captured tensors available to it. +Therefore, typical uses of captured tensors should involve values which remain +unchanged across session.run calls. Inference is a good example of this. + +SparseTensor is not supported. The return value of the decorated function +must be a Tensor or a list/tuple of Tensors. + }]; + + let arguments = (ins + Variadic:$in_tensors, + Variadic:$captured_tensors, + + SymbolRefAttr:$f, + I64Attr:$num_batch_threads, + I64Attr:$max_batch_size, + I64Attr:$batch_timeout_micros, + DefaultValuedAttr:$max_enqueued_batches, + DefaultValuedAttr:$allowed_batch_sizes, + StrAttr:$container, + StrAttr:$shared_name, + StrAttr:$batching_queue, + DefaultValuedAttr:$enable_large_batch_splitting, + I32ElementsAttr:$operand_segment_sizes + ); + + let results = (outs + Variadic:$out_tensors + ); + + TF_DerivedOperandTypeListAttr Tin = TF_DerivedOperandTypeListAttr<0>; + TF_DerivedOperandTypeListAttr Tcaptured = TF_DerivedOperandTypeListAttr<1>; + TF_DerivedResultTypeListAttr Tout = TF_DerivedResultTypeListAttr<0>; +} + #endif // TF_OPS diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc new file mode 100644 index 00000000000..1a730a38618 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc @@ -0,0 +1,2083 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Sequence.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/FormatVariadic.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/Dialect/Traits.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/DialectImplementation.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/Identifier.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/OpImplementation.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Parser.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/InliningUtils.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/util/tensor_format.h" + +namespace mlir { +namespace TF { + +namespace { +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_helpers.inc" +#include "tensorflow/compiler/mlir/tensorflow/transforms/generated_canonicalize.inc" +} // namespace + +//===----------------------------------------------------------------------===// +// AddOp +//===----------------------------------------------------------------------===// + +void AddOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} + +//===----------------------------------------------------------------------===// +// AddNOp +//===----------------------------------------------------------------------===// + +OpFoldResult AddNOp::fold(ArrayRef operands) { + if (operands.size() == 1) return *inputs().begin(); + return {}; +} + +//===----------------------------------------------------------------------===// +// AddV2Op +//===----------------------------------------------------------------------===// + +void AddV2Op::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} + +OpFoldResult AddV2Op::fold(ArrayRef operands) { + return 
IdentityArithmeticOpFolder(*this, operands); +} + +//===----------------------------------------------------------------------===// +// AllOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(AllOp op) { + return VerifyReductionInputAndDims(op.input(), op.reduction_indices(), + op.getLoc()); +} + +//===----------------------------------------------------------------------===// +// AnyOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(AnyOp op) { + return VerifyReductionInputAndDims(op.input(), op.reduction_indices(), + op.getLoc()); +} + +//===----------------------------------------------------------------------===// +// AssertOp +//===----------------------------------------------------------------------===// + +namespace { + +// Removes Assert with constant true predicate. +struct AssertWithTrue : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(AssertOp op, + PatternRewriter &rewriter) const override { + ElementsAttr cst; + if (matchPattern(op.condition(), m_Constant(&cst))) { + if (cst.getValue({}).getValue()) { + rewriter.eraseOp(op); + return success(); + } + } + return failure(); + } +}; +} // namespace + +void AssertOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} + +//===----------------------------------------------------------------------===// +// BatchMatMulOp +//===----------------------------------------------------------------------===// + +void BatchMatMulOp::getCanonicalizationPatterns( + OwningRewritePatternList &results, MLIRContext *context) { + results.insert(context); +} + +//===----------------------------------------------------------------------===// +// BatchMatMulV2Op +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(BatchMatMulV2Op op) { + if (!HasRankAtLeast(op.x(), 2)) { + return op.emitOpError("requires lhs operand to have rank at least two"); + } + if (!HasRankAtLeast(op.y(), 2)) { + return op.emitOpError("requires rhs operand to have rank at least two"); + } + return success(); +} + +void BatchMatMulV2Op::getCanonicalizationPatterns( + OwningRewritePatternList &results, MLIRContext *context) { + results.insert(context); +} + +//===----------------------------------------------------------------------===// +// BatchToSpaceOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(BatchToSpaceOp op) { + // Op already has a constraint that block_size >= 2. 
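Returning to FoldConstantCaseOp above: a standalone sketch of the idea that a constant, in-range branch index reduces an n-way Case to a direct call (plain function pointers stand in for the branch FuncOps); the BatchToSpace verifier continues below.

```c++
#include <iostream>
#include <vector>

int BranchA(int x) { return x + 1; }
int BranchB(int x) { return x * 2; }

int main() {
  std::vector<int (*)(int)> branches = {BranchA, BranchB};
  const int branch_index = 1;  // imagine this came from a constant op
  if (branch_index >= 0 && branch_index < static_cast<int>(branches.size()))
    std::cout << branches[branch_index](21) << "\n";  // 42: direct BranchB call
  return 0;
}
```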
+ int64_t block_size = op.block_size().getSExtValue(); + + llvm::SmallVector input_shape(4, ShapedType::kDynamicSize); + auto input_type = op.input().getType().cast(); + if (input_type.hasRank()) { + if (input_type.getRank() != 4) + return op.emitOpError() + << "requires input to be a 4D tensor, but got " << input_type; + + int64_t input_batch = input_type.getDimSize(0); + if (input_batch != ShapedType::kDynamicSize && + input_batch % (block_size * block_size) != 0) { + return op.emitOpError() + << "requires input batch (dimension 0) to be evenly divisible " + "by (block_size * block_size), but got input batch " + << input_batch << " and block_size " << block_size; + } + + input_shape.assign(input_type.getShape().begin(), + input_type.getShape().end()); + } + + auto crops_type = op.crops().getType().cast(); + if (crops_type.hasRank()) { + if (crops_type.getRank() != 2) + return op.emitOpError() + << "requires crops to be a 2D tensor, but got " << crops_type; + + auto dim_of_size = [&](int64_t dim, int64_t size) { + if (crops_type.isDynamicDim(dim)) return true; + return crops_type.getDimSize(dim) == size; + }; + if (!dim_of_size(0, 2) || !dim_of_size(1, 2)) + return op.emitOpError() + << "requires crops to be a tensor<2x2>, but got " << crops_type; + } + + DenseIntElementsAttr crops_attr; + // Crops are defined as [[crop_top, crop_bottom], [crop_left, crop_right]], + // and flattened as [crop_top, crop_bottom, crop_left, crop_right] + llvm::SmallVector crops_values; + if (matchPattern(op.crops(), m_Constant(&crops_attr))) { + assert(crops_attr.getNumElements() == 4 && + "tf.BatchToSpace crops must have 4 elements"); + + auto crops_range = crops_attr.getIntValues(); + for (const auto &crops_value : crops_range) { + int64_t crops_value_int = crops_value.getSExtValue(); + if (crops_value_int < 0) + return op.emitOpError() + << "requires all crop values to be nonnegative, but got " + << crops_attr; + + crops_values.push_back(crops_value_int); + } + } + + auto output_type = op.output().getType().cast(); + if (output_type.hasRank()) { + if (output_type.getRank() != 4) + return op.emitOpError() + << "requires output to be a 4D tensor, but got " << output_type; + + auto static_dims = [](int64_t dim_a, int64_t dim_b) { + return dim_a != ShapedType::kDynamicSize && + dim_b != ShapedType::kDynamicSize; + }; + + auto output_shape = output_type.getShape(); + + // output batch = input batch / (block_size * block_size). + int64_t input_batch = input_shape[0]; + int64_t output_batch = output_shape[0]; + if (static_dims(input_batch, output_batch) && + (output_batch * block_size * block_size) != input_batch) + return op.emitOpError() + << "requires output batch (dimension 0) to be equal to input " + "batch (dimension 0) / (block_size * block_size), but got " + "output batch " + << output_batch << ", input batch " << input_batch + << ", and block_size " << block_size; + + auto check_spatial_dim = [&](int64_t spatial_dim_index, + llvm::StringRef dim_name, + llvm::StringRef crop_a_name, + llvm::StringRef crop_b_name) -> LogicalResult { + int64_t input_dim = input_shape[spatial_dim_index]; + int64_t output_dim = output_shape[spatial_dim_index]; + if (!static_dims(input_dim, output_dim)) return success(); + + int64_t input_dim_pad = input_dim * block_size; + // If crops are unknown, the maximum output spatial dim size is input + // spatial dim size * block_size, as crops can be minimum 0. 
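Before the spatial-dimension checks continue below, a worked example of the shape relations being verified, using assumed values (block_size 2, NHWC input [8, 3, 5, 16], crops [[0, 1], [1, 0]]; all values are illustrative):

```c++
#include <cstdint>
#include <iostream>

int main() {
  const int64_t block_size = 2;
  const int64_t input[4] = {8, 3, 5, 16};        // batch, height, width, depth
  const int64_t crops[2][2] = {{0, 1}, {1, 0}};  // [[top, bottom], [left, right]]

  // Input batch must be divisible by block_size * block_size (8 % 4 == 0).
  int64_t output[4];
  output[0] = input[0] / (block_size * block_size);               // 2
  output[1] = input[1] * block_size - crops[0][0] - crops[0][1];  // 5
  output[2] = input[2] * block_size - crops[1][0] - crops[1][1];  // 9
  output[3] = input[3];                                           // 16
  for (int64_t d : output) std::cout << d << " ";                 // 2 5 9 16
  std::cout << "\n";
  return 0;
}
```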
+ if (crops_values.empty() && output_dim > input_dim * block_size) + return op.emitOpError() + << "requires output " << dim_name << " (dimension " + << spatial_dim_index << ") to be less than or equal to input " + << dim_name << " (dimension " << spatial_dim_index + << ") * block_size, but got output " << dim_name << " " + << output_dim << ", input " << dim_name << " " << input_dim + << ", and block_size " << block_size; + + if (!crops_values.empty()) { + // output spatial dim = input spatial dim * block_size - crops. + int64_t crop_a = crops_values[2 * (spatial_dim_index - 1)]; + int64_t crop_b = crops_values[2 * (spatial_dim_index - 1) + 1]; + if (output_dim != input_dim_pad - crop_a - crop_b) + return op.emitOpError() + << "requires output " << dim_name << " (dimension " + << spatial_dim_index << ") to be equal to input " << dim_name + << " (dimension " << spatial_dim_index << ") * block_size - " + << crop_a_name << " - " << crop_b_name << ", but got output " + << dim_name << " " << output_dim << ", input " << dim_name + << " " << input_dim << ", " << crop_a_name << " " << crop_a + << ", " << crop_b_name << " " << crop_b << ", and block_size " + << block_size; + } + + return success(); + }; + + if (failed(check_spatial_dim(1, "height", "crop_top", "crop_bottom")) || + failed(check_spatial_dim(2, "width", "crop_left", "crop_right"))) + return failure(); + + int64_t input_depth = input_shape[3]; + int64_t output_depth = output_shape[3]; + if (static_dims(input_depth, output_depth) && output_depth != input_depth) + return op.emitOpError() + << "requires output depth (dimension 3) to be equal to input " + "depth (dimension 3), but got output depth " + << output_depth << " and input depth " << input_depth; + } + + return success(); +} + +void BatchToSpaceOp::getCanonicalizationPatterns( + OwningRewritePatternList &results, MLIRContext *context) { + results.insert(context); +} + +//===----------------------------------------------------------------------===// +// BiasAddOp +//===----------------------------------------------------------------------===// + +// Verifies that, +// * the value and bias operands have valid ranks or are unranked. +// * Channel dimension of the value operand and length of bias matches if they +// are not unknown. +// +static LogicalResult Verify(BiasAddOp op) { + absl::string_view data_format(op.data_format().data(), + op.data_format().size()); + tensorflow::TensorFormat format; + bool is_valid = FormatFromString(data_format, &format); + DCHECK(is_valid) << data_format; + if (format == tensorflow::TensorFormat::FORMAT_NHWC) { + if (!HasRankAtLeast(op.value(), 2)) + return op.emitOpError( + "requires value operand to have rank at least two with `NHWC` data " + "format"); + } else { + // Op definition requires data_format to be either NHWC or NCHW. 
+ DCHECK_EQ(format, tensorflow::TensorFormat::FORMAT_NCHW); + if (!HasRankAtLeast(op.value(), 3)) + return op.emitOpError( + "requires value operand to have rank at least three with `NCHW` data " + "format"); + } + + if (!IsOfRankOrUnranked(op.bias(), 1)) + return op.emitOpError("requires bias operand to have rank exactly one"); + + RankedTensorType value_ty = op.value().getType().dyn_cast(); + RankedTensorType bias_ty = op.bias().getType().dyn_cast(); + if (!bias_ty || !value_ty) return success(); + + int64_t feature_dim_idx = + tensorflow::GetTensorFeatureDimIndex(value_ty.getRank(), format); + int64_t feature_dim = value_ty.getDimSize(feature_dim_idx); + int64_t bias_len = bias_ty.getDimSize(0); + if (feature_dim != -1 && bias_len != -1 && feature_dim != bias_len) { + return op.emitOpError() + << "requires channel dimension and feature dimension to match; " + "found " + << feature_dim << " and " << bias_len << ", respectively"; + } + return success(); +} + +//===----------------------------------------------------------------------===// +// BiasAddGradOp +//===----------------------------------------------------------------------===// + +// Verifies that, +// * the out_backprop operands have valid ranks or are unranked. +// +static LogicalResult Verify(BiasAddGradOp op) { + absl::string_view data_format(op.data_format().data(), + op.data_format().size()); + tensorflow::TensorFormat format; + bool is_valid = FormatFromString(data_format, &format); + DCHECK(is_valid) << data_format; + if (format == tensorflow::TensorFormat::FORMAT_NHWC) { + if (!HasRankAtLeast(op.out_backprop(), 2)) + return op.emitOpError( + "requires out_backprop operand to have rank at least two with `NHWC` " + "data format"); + } else { + // Op definition requires data_format to be either NHWC or NCHW. + DCHECK_EQ(format, tensorflow::TensorFormat::FORMAT_NCHW); + if (!HasRankAtLeast(op.out_backprop(), 3)) + return op.emitOpError( + "requires out_backprop operand to have rank at least three with " + "`NCHW` data format"); + } + + return success(); +} + +//===----------------------------------------------------------------------===// +// BiasAddV1Op +//===----------------------------------------------------------------------===// + +void BiasAddV1Op::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} + +//===----------------------------------------------------------------------===// +// BitcastOp +//===----------------------------------------------------------------------===// + +void BitcastOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} + +//===----------------------------------------------------------------------===// +// BroadcastToOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(BroadcastToOp op) { + // TODO(antiagainst): check that + // * The 'shape' input is an 1-D int tensor. + // * Each dimension pair of the source and target shapes are either equal + // or one of them is one. 
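+  //   For example, broadcasting a tensor<1x3xf32> source to target shape
+  //   [4, 3] is valid because each dimension pair (1 vs 4, 3 vs 3) is either
+  //   equal or contains a 1.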
+ return success(); +} + +//===----------------------------------------------------------------------===// +// CaseOp +//===----------------------------------------------------------------------===// + +class FoldConstantCaseOp : public OpRewritePattern { + public: + explicit FoldConstantCaseOp(MLIRContext *context) + : OpRewritePattern(context) {} + LogicalResult matchAndRewrite(TF::CaseOp op, + PatternRewriter &rewriter) const override; +}; + +LogicalResult FoldConstantCaseOp::matchAndRewrite( + TF::CaseOp op, PatternRewriter &rewriter) const { + // Extract the constant cond value. + DenseIntElementsAttr branch; + if (!matchPattern(op.branch_index(), m_Constant(&branch))) return failure(); + + // Only attempt to fold scalar valued case statements. + // TODO(jpienaar): This can be removed if CaseOp's verifier covers it. + if (!branch.getType().cast().getShape().empty()) + return failure(); + + int index = *branch.getValues().begin(); + // TODO(jpienaar): This can be removed if CaseOp's verifier covers it. + if (index >= op.branches().size()) return failure(); + + auto func = op.branches()[index].cast(); + auto empty = rewriter.getStringAttr(""); + auto call_op = rewriter.create( + op.getLoc(), op.getResultTypes(), op.getOperands().drop_front(), func, + /*config=*/empty, /*config_proto=*/empty, /*executor_type=*/empty); + PropagateDeviceAndInternalAttrs(op.getOperation(), call_op); + rewriter.replaceOp(op, call_op.getResults()); + return success(); +} + +void CaseOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} + +//===----------------------------------------------------------------------===// +// CastOp +//===----------------------------------------------------------------------===// + +OpFoldResult CastOp::fold(ArrayRef operands) { + // Cast with the same type is a no-op. + Value operand = getOperand(); + if (getType() == operand.getType()) return operand; + return {}; +} + +//===----------------------------------------------------------------------===// +// ConcatOp and ConcatV2Op +//===----------------------------------------------------------------------===// + +template ::value>::type * = nullptr> +static LogicalResult Verify(OpT op) { + // TODO(hinsu): Convert variadic length attributes to derived attributes. + Operation::operand_range values = op.values(); + + int axis_idx = std::is_same() ? 0 : 1; + Value axis = *op.getODSOperands(axis_idx).begin(); + if (!HasRankAtMost(axis, 1)) { + return op.emitOpError( + "requires axis to be of scalar type (or vector type for older " + "versions)"); + } + + return VerifyTypesCompatibility(values, + /*mask_one_dim=*/true, op.getOperation()); +} + +void ConcatOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} + +namespace { + +// Hoist coefficient-wise unary operation out of the Concat op: +// +// %0 = "tf.Log1p"(%arg_0) +// %1 = "tf.Log1p"(%arg_1) +// ... 
+// %n = "tf.Log1p"(%arg_n) +// %m = "tf.ConcatV2"(%0, %1, ..., %n, %axis) +// +// Rewrite it to: +// +// %0 = "tf.ConcatV2"(%arg_0, %arg_1, ..., %arg_n, %axis) +// %1 = "tf.Log1p"(%0) +class HoistCwiseUnaryOutOfConcat : public OpRewritePattern { + public: + explicit HoistCwiseUnaryOutOfConcat(MLIRContext *context) + : OpRewritePattern(context) {} + LogicalResult matchAndRewrite(TF::ConcatV2Op op, + PatternRewriter &rewriter) const override; +}; + +LogicalResult HoistCwiseUnaryOutOfConcat::matchAndRewrite( + TF::ConcatV2Op op, PatternRewriter &rewriter) const { + auto loc = op.getLoc(); + + // All concat operands must be defined by ops. + Operation *first_arg_op = op.values().front().getDefiningOp(); + if (first_arg_op == nullptr) return failure(); + + // All concat operands must be produced by the coeff-wise unary operation. + if (!first_arg_op->hasTrait()) return failure(); + + // All concat operands must be defined by the op of same kind. + bool args_same_op = llvm::all_of(op.values(), [&](Value arg) -> bool { + Operation *arg_op = arg.getDefiningOp(); + return arg_op && arg_op->getName() == first_arg_op->getName(); + }); + if (!args_same_op) return failure(); + + // Collect unary operations operands. + auto unary_operands = llvm::map_range(op.values(), [](Value arg) -> Value { + return arg.getDefiningOp()->getOperand(0); + }); + SmallVector unary_ops_args(unary_operands); + + // Concatenate unary ops operands. + auto concat_unary_operands = + rewriter.create(loc, op.getType(), unary_ops_args, op.axis()); + + // Replace original concat with an unary op. + OperationState new_unary_op_state(loc, first_arg_op->getName().getStringRef(), + concat_unary_operands.getResult(), + op.getResult().getType(), + ArrayRef()); + Operation *new_unary_op = rewriter.createOperation(new_unary_op_state); + + rewriter.replaceOp(op, new_unary_op->getResults()); + + return success(); +} + +// Hoist coefficient-wise binary operation out of the Concat op: +// +// %0 = tf.Mul(%lhs_0, %rhs_0) +// %1 = tf.Mul(%lhs_1, %rhs_1) +// ... +// %n = tf.Mul(%lhs_n, %rhs_n) +// %m = tf.ConcatV2(%0, %1, ..., %n, %axis) +// +// Rewrite it to: +// +// %0 = tf.ConcatV2(%lhs0, %lhs1, ..., %lhs_n, %lhs_concat_axis) +// %1 = tf.ConcatV2(%rhs0, %rhs1, ..., %rhs_n, %rhs_concat_axis) +// %2 = tf.Mul(%0, %1) +// +// Because coefficient-wise binary operations support implicit broadcasting, we +// should be very careful with this optimization, and do not accidentally +// produce incorrect concat operations. +class HoistCwiseBinaryOutOfConcat : public OpRewritePattern { + public: + explicit HoistCwiseBinaryOutOfConcat(MLIRContext *context) + : OpRewritePattern(context) {} + LogicalResult matchAndRewrite(TF::ConcatV2Op op, + PatternRewriter &rewriter) const override; + + private: + struct HoistParams { + SmallVector lhs_args; + SmallVector rhs_args; + int64_t lhs_axis; + int64_t rhs_axis; + Type lhs_concat_type; + Type rhs_concat_type; + }; + + // Returns parameters of a binary op hoisting out of concatenation if all of + // the operands are in one of the compatible configurations. + Optional GetHoistParams(TF::ConcatV2Op op, int64_t axis) const; +}; + +LogicalResult HoistCwiseBinaryOutOfConcat::matchAndRewrite( + TF::ConcatV2Op op, PatternRewriter &rewriter) const { + auto loc = op.getLoc(); + + // Axis must be a constant scalar value. 
+ DenseIntElementsAttr axis_attr; + if (!matchPattern(op.axis(), m_Constant(&axis_attr))) return failure(); + if (axis_attr.getNumElements() != 1) return failure(); + int64_t axis = + axis_attr.getSplatValue().getValue().getSExtValue(); + + // All concat operands must be defined by ops. + Operation *first_arg_op = op.values().front().getDefiningOp(); + if (first_arg_op == nullptr) return failure(); + + // All concat operands must be produced by the coeff-wise binary operation. + if (!first_arg_op->hasTrait()) return failure(); + + // All concat operands must be defined by the op of same kind. + bool args_same_op = llvm::all_of(op.values(), [&](Value arg) -> bool { + Operation *arg_op = arg.getDefiningOp(); + return arg_op && arg_op->getName() == first_arg_op->getName(); + }); + if (!args_same_op) return failure(); + + // Compute binary operands hoist parameters. + auto hoist_params = GetHoistParams(op, axis); + if (!hoist_params.hasValue()) return failure(); + + // New lhs and rhs concatenation axis. + auto axis_type = mlir::RankedTensorType::get({}, rewriter.getIntegerType(64)); + auto lhs_axis = rewriter.create( + loc, DenseIntElementsAttr::get(axis_type, hoist_params->lhs_axis)); + auto rhs_axis = rewriter.create( + loc, DenseIntElementsAttr::get(axis_type, hoist_params->rhs_axis)); + + // Concatenate binary ops operands on the new axis. + auto lhs_concat = rewriter.create( + loc, hoist_params->lhs_concat_type, hoist_params->lhs_args, lhs_axis); + auto rhs_concat = rewriter.create( + loc, hoist_params->rhs_concat_type, hoist_params->rhs_args, rhs_axis); + + // Replace original concat with a binary op. + OperationState new_binary_op_state( + loc, first_arg_op->getName().getStringRef(), + {lhs_concat.getResult(), rhs_concat.getResult()}, + op.getResult().getType(), ArrayRef()); + Operation *new_binary_op = rewriter.createOperation(new_binary_op_state); + + rewriter.replaceOp(op, new_binary_op->getResults()); + + return success(); +} + +Optional +HoistCwiseBinaryOutOfConcat::GetHoistParams(TF::ConcatV2Op op, + int64_t axis) const { + // Collects lhs or rhs arguments of concat op operands. + auto args = [&](int operand_idx) -> SmallVector { + auto range = llvm::map_range(op.values(), [&](Value arg) { + return arg.getDefiningOp()->getOperand(operand_idx); + }); + return {range.begin(), range.end()}; + }; + + // Returns true if all binary ops operands at `operand_idx` index are tensors + // of `axis + 1` rank and axis dim has size `1`. + auto is_all_tensors = [&](int operand_idx, int axis) -> bool { + return llvm::all_of(op.values(), [&](Value arg) -> bool { + auto operand = arg.getDefiningOp()->getOperand(operand_idx); + auto ranked = operand.getType().dyn_cast(); + return ranked && ranked.getRank() == (axis + 1) && + ranked.getShape()[axis] == 1; + }); + }; + + // Returns true if all binary ops operands at `operand_idx` index are scalars. + auto is_all_scalars = [&](int operand_idx) -> bool { + return llvm::all_of(op.values(), [&](Value arg) -> bool { + auto operand = arg.getDefiningOp()->getOperand(operand_idx); + auto ranked = operand.getType().dyn_cast(); + return ranked && ranked.hasRank() && ranked.getRank() == 0; + }); + }; + + // Concat result type must be a ranked tensor. + auto ranked = op.getType().dyn_cast(); + if (!ranked) return None; + + // TODO(ezhulenev): Add support for more valid concat patterns. + + // Tensor + Scalar: [..., 1] + [] <- scalar + // ^ + // \- axis is the innermost dimension. 
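+  //
+  // For example, concatenating %0 = tf.Mul(%a : tensor<4x1xf32>, %s0 : tensor<f32>)
+  // and %1 = tf.Mul(%b : tensor<4x1xf32>, %s1 : tensor<f32>) on axis = 1 becomes
+  // tf.Mul(tf.ConcatV2(%a, %b, axis = 1), tf.ConcatV2(%s0, %s1, axis = 0)).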
+ // + // Concatenate tensor arguments on the same axis as the original operation, + // and concatenate scalars into the vector. + if (is_all_tensors(0, axis) && is_all_scalars(1)) { + std::array rhs_dims{static_cast(op.values().size())}; + auto rhs_type = RankedTensorType::get(rhs_dims, ranked.getElementType()); + return HoistParams{args(0), args(1), axis, 0, op.getType(), rhs_type}; + } + + return None; +} + +} // namespace + +void ConcatV2Op::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert( + context); +} + +//===----------------------------------------------------------------------===// +// ConcatOffsetOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(ConcatOffsetOp op) { + if (op.N() < 2) + return op.emitOpError() << "requires N to be at least 2, got " << op.N(); + + if (op.shape().size() != op.offset().size()) + return op.emitOpError() + << "requires sizes of shapes and offsets to be the same, got sizes " + << op.shape().size() << " and " << op.offset().size(); + + auto ranked_dim = op.concat_dim().getType().dyn_cast(); + if (ranked_dim && ranked_dim.getRank() != 0) + return op.emitOpError() + << "requires concat_dim to be a scalar, got tensor of rank " + << ranked_dim.getRank(); + + int64_t num_dims = -1; + for (auto shape_offset_idx : + llvm::enumerate(llvm::zip(op.shape(), op.offset()))) { + Value shape = std::get<0>(shape_offset_idx.value()); + Value offset = std::get<1>(shape_offset_idx.value()); + const size_t idx = shape_offset_idx.index(); + + if (failed(verifyCompatibleShape(shape.getType(), offset.getType()))) + return op.emitOpError() << "requires operand and result " << idx + << " to have compatible shapes"; + + auto ranked_shape = shape.getType().dyn_cast(); + if (!ranked_shape) continue; + + if (ranked_shape.getRank() != 1) + return op.emitOpError() << "requires shape tensor operand " << idx + << " to be of rank 1, got tensor of rank " + << ranked_shape.getRank(); + + if (!ranked_shape.hasStaticShape()) continue; + + int64_t ranked_shape_dim = ranked_shape.getDimSize(0); + if (num_dims == -1) + num_dims = ranked_shape_dim; + else if (ranked_shape_dim != num_dims) + return op.emitOpError() + << "requires shape tensor (rank 1) operand " << idx + << " to be of length " << num_dims + << ", got tensor (rank 1) of length " << ranked_shape_dim; + } + + return success(); +} + +LogicalResult ConcatOffsetOp::fold(ArrayRef operands, + SmallVectorImpl &results) { + // ConcatOffset must have its first operand be concat_dim and at least two + // shape tensors in variadic shapes operand. + if (operands.size() < 3) return failure(); + + // Check concat_dim is a scalar. + auto concat_dim_attr = operands[0].dyn_cast_or_null(); + if (!concat_dim_attr || concat_dim_attr.getType().getRank() != 0) + return failure(); + + llvm::SmallVector shapes; + shapes.reserve(operands.size() - 1); + for (Attribute shape : llvm::drop_begin(operands, 1)) + if (auto shape_attr = shape.dyn_cast_or_null()) + shapes.push_back(shape_attr); + else + return failure(); + + // Check all shapes are vectors of the same length. + if (shapes.front().getType().getRank() != 1) return success(); + const int64_t num_dims = shapes.front().getNumElements(); + for (DenseIntElementsAttr shape : llvm::drop_begin(shapes, 1)) + if (shape.getType().getRank() != 1 || shape.getNumElements() != num_dims) + return failure(); + + // Check concat_dim is within [-num_dims, num_dims). 
+ int32_t concat_dim = (*concat_dim_attr.getValues().begin()); + if (concat_dim < 0) concat_dim += num_dims; + if (concat_dim >= num_dims || concat_dim < 0) return failure(); + + // Check all elements besides at concat_dim match across all shape tensors. + SmallVector shape0; + shape0.reserve(num_dims); + for (int32_t dim : shapes.front().getValues()) shape0.push_back(dim); + + for (DenseIntElementsAttr shape : llvm::drop_begin(shapes, 1)) { + for (auto dims_and_idx : llvm::enumerate(llvm::zip(shape0, shape))) { + if (dims_and_idx.index() == concat_dim) continue; + + if (std::get<0>(dims_and_idx.value()) != + std::get<1>(dims_and_idx.value()).getSExtValue()) + return failure(); + } + } + + // Compute an exclusive cumulative sum of elements at concat_dim. + results.reserve(shapes.size()); + SmallVector cumulative_sum(num_dims, 0); + RankedTensorType offset_type = + RankedTensorType::get({num_dims}, IntegerType::get(32, getContext())); + for (DenseIntElementsAttr shape : shapes) { + results.push_back(DenseIntElementsAttr::get(offset_type, cumulative_sum)); + cumulative_sum[concat_dim] += shape.getValue(concat_dim); + } + + return success(); +} + +//===----------------------------------------------------------------------===// +// ConjOp +//===----------------------------------------------------------------------===// + +void ConjOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} + +//===----------------------------------------------------------------------===// +// ConstOp +//===----------------------------------------------------------------------===// + +OpFoldResult ConstOp::fold(ArrayRef operands) { + assert(operands.empty() && "constant has no operands"); + + // Return the held attribute value. + return value(); +} + +// Builds a constant op with the specified attribute `value`. The result +// op's type is deduced from `value`; if `value` is of scalar type, +// wraps it up with a tensor type of empty shape. +// TODO(jpienaar): This one differs from the autogenerated one as it takes an +// attribute but always creates an ElementsAttr internally. +void ConstOp::build(OpBuilder &builder, OperationState &result, + Attribute value) { + ShapedType type; + if (auto elem_attr = value.dyn_cast()) { + return ConstOp::build(builder, result, elem_attr); + } else if (value.isa()) { + // All TensorFlow types must be tensor types. In the build() method, + // we want to provide more flexibility by allowing attributes of scalar + // types. But we need to wrap it up with ElementsAttr to construct + // valid TensorFlow constants. + type = RankedTensorType::get(/*shape=*/{}, value.getType()); + return ConstOp::build(builder, result, DenseElementsAttr::get(type, value)); + } + // TODO(jpienaar): support other TensorFlow specific types. + llvm_unreachable("unsupported attribute type for building tf.Const"); +} + +void ConstOp::build(OpBuilder &builder, OperationState &result, Type type, + Attribute value) { + // Handle the case where the type and value are already tensors. + if (type.isa() && value.isa()) { + result.addTypes(type); + result.addAttribute("value", value); + return; + } + + // Otherwise, default to the attribute builder. 
+  ConstOp::build(builder, result, value);
+  assert(type == result.types[0] && "type mismatch in construction");
+}
+
+LogicalResult ConstOp::inferReturnTypes(
+    MLIRContext *context, Optional<Location> location, ValueRange operands,
+    DictionaryAttr attributes, RegionRange regions,
+    SmallVectorImpl<Type> &inferredReturnTypes) {
+  auto value = attributes.get("value");
+  if (!value) return emitOptionalError(location, "missing attribute 'value'");
+  if (auto elem_attr = value.dyn_cast<ElementsAttr>()) {
+    inferredReturnTypes.assign({elem_attr.getType()});
+    return success();
+  }
+  return emitOptionalError(location,
+                           "attribute 'value' failed to satisfy constraint: "
+                           "constant vector/tensor");
+}
+
+//===----------------------------------------------------------------------===//
+// Conv2DOp and Conv3DOp
+//===----------------------------------------------------------------------===//
+
+template <typename OpT>
+static LogicalResult VerifyConvOpAttributes(OpT op, int num_dims) {
+  if (!IsOfRankOrUnranked(op.getResult(), num_dims))
+    return op.emitOpError()
+           << "requires result to be " << num_dims << "D tensor";
+
+  auto is_not_positive = [](Attribute val) {
+    return val.cast<IntegerAttr>().getValue().getSExtValue() <= 0;
+  };
+
+  int64_t strides_size = op.strides().size();
+  if (strides_size != num_dims)
+    return op.emitOpError() << "requires strides attribute length to be "
+                            << num_dims << "; actual length " << strides_size;
+  if (llvm::any_of(op.strides().getValue(), is_not_positive))
+    return op.emitOpError("requires positive strides");
+
+  int64_t dilations_size = op.dilations().size();
+  if (dilations_size != num_dims)
+    return op.emitOpError() << "requires dilations attribute length to be "
+                            << num_dims << "; actual length " << dilations_size;
+  if (llvm::any_of(op.dilations().getValue(), is_not_positive))
+    return op.emitOpError("requires positive dilations");
+
+  return success();
+}
+
+// Verifies that,
+// * Ranks of operands and result are valid
+// * Number of input channels is divisible by the number of filter input
+//   channels
+// * Length of explicit_paddings attribute is valid and has non negative
+//   elements
+// * strides and dilations attributes have positive elements
+template <typename OpT, typename std::enable_if<llvm::is_one_of<
+                            OpT, Conv2DOp, Conv3DOp>::value>::type * = nullptr>
+static LogicalResult Verify(OpT op) {
+  int num_spatial_dims = std::is_same<OpT, Conv2DOp>() ? 2 : 3;
+  int num_dims = 2 + num_spatial_dims;
+
+  if (!IsOfRankOrUnranked(op.input(), num_dims) ||
+      !IsOfRankOrUnranked(op.filter(), num_dims))
+    return op.emitOpError()
+           << "requires operands to be " << num_dims << "D tensor";
+
+  // EXPLICIT padding mode and the associated attribute is limited to Conv2D.
+  // So, fetch attribute by string instead of the op.explicit_paddings()
+  // attribute getter.
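+  // The attribute holds two values (before/after padding) per dimension, in
+  // the same order as data_format, so for Conv2D it must have 2 * 4 = 8
+  // elements.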
+ if (op.padding() == "EXPLICIT") { + auto paddings = op.template getAttrOfType("explicit_paddings"); + if (!paddings) + return op.emitOpError() << "requires attribute 'explicit_paddings' with " + "'EXPLICIT' padding mode"; + + int64_t paddings_size = paddings.size(); + int64_t expected_size = 2 * num_dims; + + if (paddings_size != expected_size) + return op.emitOpError() + << "requires explicit_paddings attribute length to be " + << expected_size << "; actual length " << paddings_size; + + auto is_negative = [](Attribute val) { + return val.cast().getValue().getSExtValue() < 0; + }; + if (llvm::any_of(paddings.getValue(), is_negative)) + return op.emitOpError("requires non negative explicit paddings"); + } + + LogicalResult verify_result = VerifyConvOpAttributes(op, num_dims); + if (failed(verify_result)) { + return verify_result; + } + + int64_t input_channels = -1; + if (auto ty = op.input().getType().template dyn_cast()) { + absl::string_view data_format(op.data_format().data(), + op.data_format().size()); + tensorflow::TensorFormat format; + auto is_valid = FormatFromString(data_format, &format); + DCHECK(is_valid) << data_format; + int idx = tensorflow::GetTensorFeatureDimIndex(num_dims, format); + input_channels = ty.getDimSize(idx); + } + + int64_t filter_channels = -1; + if (auto ty = op.filter().getType().template dyn_cast()) { + int idx = tensorflow::GetFilterTensorInputChannelsDimIndex( + num_dims, tensorflow::FORMAT_HWIO); + filter_channels = ty.getDimSize(idx); + } + + if (input_channels != -1 && filter_channels != -1 && + input_channels % filter_channels != 0) + return op.emitOpError() + << "requires the number of input channels to be divisible by the " + "number of filter input channels; found " + << input_channels << " and " << filter_channels << ", respectively"; + + return success(); +} + +LogicalResult Conv2DOp::UpdateDataFormat(StringRef data_format) { + auto perm = GetDataFormatPermutation(this->data_format(), data_format); + if (perm.empty()) return failure(); + + // Update data_format attribute and result types. + if (failed(::mlir::TF::UpdateDataFormat(data_format, this))) return failure(); + + // Update convolution attributes. + setAttr("dilations", ShuffleArrayAttr(dilations(), perm)); + setAttr("strides", ShuffleArrayAttr(strides(), perm)); + setAttr("explicit_paddings", ShuffleArrayAttr(explicit_paddings(), perm, 2)); + + return success(); +} + +StringRef Conv2DOp::GetOptimalLayout(const RuntimeDevices &devices) { + // Keep current data format if no GPUs are available or if explicit placement + // does not allow to use GPU for this operation. + if (!CanUseGpuDevice(devices) || !CanUseGpuDevice(getOperation())) + return data_format(); + + // Input must be a tensor. + auto input_ty = input().getType().dyn_cast(); + if (!input_ty) return data_format(); + + // For f16 data type on devices with Tensor Cores support NHWC data format + // is up to ~2x faster. + const bool is_f16 = input_ty.getElementType().isF16(); + if (is_f16 && CanUseTensorCores(devices)) return "NHWC"; + + // For f32/f16 data type decision depends on the filter size in spatial + // dimensions, for other data types we keep current data format. + if (!input_ty.getElementType().isF32() && !input_ty.getElementType().isF16()) + return data_format(); + + // Keep current data format if filter rank is unknown or not equal to 4. 
+ auto filter_ty = filter().getType().dyn_cast(); + if (!filter_ty || filter_ty.getRank() != 4) return data_format(); + + const int64_t d0 = filter_ty.getDimSize(0); + const int64_t d1 = filter_ty.getDimSize(1); + + auto all_ones = [](ArrayAttr arr) -> bool { + return llvm::all_of(arr, [](Attribute attr) -> bool { + return attr.cast().getInt() == 1; + }); + }; + + // Convolutions with 1x1 filter and with strides and dilations all ones, can + // be computed as a GEMM in NHWC data format, and can be up to ~2x times + // faster than convolution in NCHW. + const bool one_by_one = d0 == 1 && d1 == 1; + const bool trivial_strides = all_ones(strides()); + const bool trivial_dilations = all_ones(dilations()); + + // TODO(ezhulenev): This might lead to excessive transposes in the final IR, + // if the ratio of 1x1 convolutions to regular convolutions is close to 1:1. + // Also FusedBatchNorm in training mode prefers NCHW data format. Check if all + // users can efficiently use NHWC data format? + if (one_by_one && trivial_strides && trivial_dilations) { + return "NHWC"; + } + + // If filter spatial dimensions are unknown or not 1x1 we prefer NCHW, because + // it's the fastest option on NVIDIA GPUs with cuDNN library support. + return "NCHW"; +} + +//===----------------------------------------------------------------------===// +// Conv2dBackpropFilterOp +//===----------------------------------------------------------------------===// + +LogicalResult Conv2DBackpropFilterOp::UpdateDataFormat(StringRef data_format) { + StringRef src_data_format = this->data_format(); + + auto perm = GetDataFormatPermutation(src_data_format, data_format); + if (perm.empty()) return failure(); + + // Update data_format attribute and result types. + if (failed(::mlir::TF::UpdateDataFormat(data_format, this))) return failure(); + + // Update convolution attributes. + setAttr("dilations", ShuffleArrayAttr(dilations(), perm)); + setAttr("strides", ShuffleArrayAttr(strides(), perm)); + setAttr("explicit_paddings", ShuffleArrayAttr(explicit_paddings(), perm, 2)); + + // Permute filter sizes operand. + OpBuilder builder(getOperation()); + auto filter_sizes_permuted = builder.create( + getLoc(), filter_sizes(), StringAttr::get(src_data_format, getContext()), + StringAttr::get(data_format, getContext())); + setOperand(1, filter_sizes_permuted); + + return success(); +} + +StringRef Conv2DBackpropFilterOp::GetOptimalLayout( + const RuntimeDevices &devices) { + // Keep current data format if no GPUs are available or if explicit placement + // does not allow to use GPU for this operation. + if (!CanUseGpuDevice(devices) || !CanUseGpuDevice(getOperation())) + return data_format(); + + // Input must be a tensor. + auto input_ty = input().getType().dyn_cast(); + if (!input_ty) return data_format(); + + // For f16 data type on devices with Tensor Cores support NHWC data format + // is up to ~2x faster. + const bool is_f16 = input_ty.getElementType().isF16(); + if (is_f16 && CanUseTensorCores(devices)) return "NHWC"; + + // Otherwise always use "NCHW". 
+ return "NCHW"; +} + +//===----------------------------------------------------------------------===// +// Conv2DBackpropInputOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(Conv2DBackpropInputOp op) { + int num_spatial_dims = 2; + int num_dims = 2 + num_spatial_dims; + + if (!IsOfRankOrUnranked(op.out_backprop(), num_dims) || + !IsOfRankOrUnranked(op.filter(), num_dims)) + return op.emitOpError() + << "requires operands to be " << num_dims << "D tensor"; + + LogicalResult verify_result = VerifyConvOpAttributes(op, num_dims); + if (failed(verify_result)) { + return verify_result; + } + + return success(); +} + +LogicalResult Conv2DBackpropInputOp::UpdateDataFormat(StringRef data_format) { + StringRef src_data_format = this->data_format(); + + auto perm = GetDataFormatPermutation(src_data_format, data_format); + if (perm.empty()) return failure(); + + // Update data_format attribute and result types. + if (failed(::mlir::TF::UpdateDataFormat(data_format, this))) return failure(); + + // Update convolution attributes. + setAttr("dilations", ShuffleArrayAttr(dilations(), perm)); + setAttr("strides", ShuffleArrayAttr(strides(), perm)); + setAttr("explicit_paddings", ShuffleArrayAttr(explicit_paddings(), perm, 2)); + + // Permute input sizes operand. + OpBuilder builder(getOperation()); + auto input_sizes_permuted = builder.create( + getLoc(), input_sizes(), StringAttr::get(src_data_format, getContext()), + StringAttr::get(data_format, getContext())); + setOperand(0, input_sizes_permuted); + + return success(); +} + +StringRef Conv2DBackpropInputOp::GetOptimalLayout( + const RuntimeDevices &devices) { + // Keep current data format if no GPUs are available or if explicit placement + // does not allow to use GPU for this operation. + if (!CanUseGpuDevice(devices) || !CanUseGpuDevice(getOperation())) + return data_format(); + + // Filter must be a tensor. + auto filter_ty = filter().getType().dyn_cast(); + if (!filter_ty) return data_format(); + + // For f16 data type on devices with Tensor Cores support NHWC data format + // is up to ~2x faster. + const bool is_f16 = filter_ty.getElementType().isF16(); + if (is_f16 && CanUseTensorCores(devices)) return "NHWC"; + + // Otherwise always use "NCHW". 
+ return "NCHW"; +} + +//===----------------------------------------------------------------------===// +// DataFormatVecPermuteOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(DataFormatVecPermuteOp op) { + auto input_ty = op.x().getType().dyn_cast(); + if (!input_ty) return success(); + + int rank = input_ty.getRank(); + if (rank != 1 && rank != 2) + return op.emitOpError("requires input of rank 1 or 2"); + + if (rank == 1) { + int64_t dim0 = input_ty.getDimSize(0); + if (dim0 != ShapedType::kDynamicSize && dim0 != 4 && dim0 != 2) + return op.emitOpError("requires 1D input of size 4 or size 2"); + } + + if (rank == 2) { + int64_t dim0 = input_ty.getDimSize(0); + if (dim0 != ShapedType::kDynamicSize && dim0 != 4) + return op.emitOpError( + "requires first dimensions of 2D input to be of size 4"); + + int64_t dim1 = input_ty.getDimSize(1); + if (dim1 != ShapedType::kDynamicSize && dim1 != 2) + return op.emitOpError( + "requires second dimensions of 2D input to be of size 2"); + } + + return success(); +} + +//===----------------------------------------------------------------------===// +// DivOp +//===----------------------------------------------------------------------===// + +void DivOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} + +OpFoldResult DivOp::fold(ArrayRef operands) { + return IdentityArithmeticOpFolder(*this, operands); +} + +//===----------------------------------------------------------------------===// +// DynamicStitchOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(DynamicStitchOp op) { + if (op.N() < 1) return op.emitOpError("requires attribute N with value >= 1"); + + if (RankedTensorType out_ty = op.getType().dyn_cast()) { + if (out_ty.getRank() == 0) { + return op.emitOpError("requires non scalar output"); + } + } + + llvm::SmallDenseSet index_values; + bool all_indices_const = true; + int32_t max_index = -1; + llvm::Optional> inferred_item_shape; + for (auto it : llvm::zip(op.indices(), op.data())) { + Value index = std::get<0>(it); + + DenseIntElementsAttr index_attr; + if (matchPattern(index, m_Constant(&index_attr))) { + for (int32_t index : index_attr.getValues()) { + if (index < 0) + return op.emitOpError() + << "requires non-negative index values; found " << index; + max_index = std::max(index, max_index); + index_values.insert(index); + } + } else { + all_indices_const = false; + } + + Value data = std::get<1>(it); + RankedTensorType index_ty = index.getType().dyn_cast(); + RankedTensorType data_ty = data.getType().dyn_cast(); + if (!index_ty || !data_ty) continue; + + int64_t index_rank = index_ty.getRank(); + ArrayRef data_shape = data_ty.getShape(); + ArrayRef index_shape = index_ty.getShape(); + if (failed(mlir::verifyCompatibleShape(index_shape, + data_shape.take_front(index_rank)))) + return op.emitOpError() << "requires shape of data with type " << data_ty + << " to have prefix matching with shape of the " + "corresponding index type " + << index_ty; + + ArrayRef item_shape = data_shape.drop_front(index_rank); + if (!inferred_item_shape) { + inferred_item_shape = llvm::to_vector<4>(item_shape); + continue; + } + + if (failed(mlir::verifyCompatibleShape(item_shape, *inferred_item_shape))) + return op.emitOpError() << "has inconsistent shaped data and index " + "pairs; inferred item shapes [" + << llvm::makeArrayRef(*inferred_item_shape) + << "] 
and [" << item_shape << "] don't match"; + for (int i = 0, e = item_shape.size(); i < e; ++i) { + int64_t &inferred_dim = (*inferred_item_shape)[i]; + int64_t dim = item_shape[i]; + if (ShapedType::isDynamic(inferred_dim)) inferred_dim = dim; + } + } + + // If all indices are constants, then verify that they cover all indices in + // the range [0, max_index] and the output type is legal. + if (all_indices_const) { + for (int32_t i = 0; i <= max_index; i++) { + if (!index_values.count(i)) + return op.emitOpError() << "missing index " << i; + } + + if (inferred_item_shape) { + SmallVector expected_shape; + expected_shape.push_back(max_index + 1); + expected_shape.append(inferred_item_shape->begin(), + inferred_item_shape->end()); + + auto out_ty = op.getType().cast(); + auto expected_out_ty = + RankedTensorType::get(expected_shape, out_ty.getElementType()); + + if (!AreCastCompatible({out_ty, expected_out_ty})) { + return op.emitOpError() << "has invalid output type; should be " + "compatible with inferred type " + << expected_out_ty; + } + } + } + + return success(); +} + +//===----------------------------------------------------------------------===// +// EinsumOp +//===----------------------------------------------------------------------===// + +// Verifies that, +// * Arity of the op is at most two. +// +// TODO(hinsu): Verify einsum equation attribute. +static LogicalResult Verify(EinsumOp op) { + if (op.N() > 2) { + return op.emitOpError("supports at most two operands"); + } + return success(); +} + +//===----------------------------------------------------------------------===// +// EmptyOp +//===----------------------------------------------------------------------===// + +OpFoldResult EmptyOp::fold(ArrayRef operands) { + assert(operands.size() == 1 && "empty op has one operand"); + + Attribute attr = operands.front(); + if (!attr) return {}; + + auto int_attr = attr.cast(); + SmallVector out_shape; + for (const auto val : int_attr.getValues()) { + out_shape.push_back(val); + } + + auto type = getResult().getType().cast(); + auto etype = type.getElementType(); + + // We can not fold if the result is not static. + if (!type.hasStaticShape()) return {}; + + if (auto float_type = etype.dyn_cast()) { + auto out_type = RankedTensorType::get(out_shape, float_type); + return DenseElementsAttr::get(out_type, + {APFloat(float_type.getFloatSemantics())}); + } + + if (auto int_type = etype.dyn_cast()) { + auto out_type = RankedTensorType::get(out_shape, etype); + APInt val(int_type.getWidth(), 0, int_type.getSignedness()); + return DenseElementsAttr::get(out_type, val); + } + + return {}; +} + +//===----------------------------------------------------------------------===// +// EmptyTensorListOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(EmptyTensorListOp op) { + if (!IsOfRankOrUnranked(op.element_shape(), 0) && + !IsOfRankOrUnranked(op.element_shape(), 1)) { + return op.emitOpError("requires element_shape operand to be 0D/1D tensor"); + } + + if (!IsOfRankOrUnranked(op.max_num_elements(), 0)) { + return op.emitOpError("requires max_num_elements operand to be 0D tensor"); + } + return success(); +} + +//===----------------------------------------------------------------------===// +// EqualOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(EqualOp op) { + // If we allow inputs to have incompatible type, then nothing to do. 
+ if (!op.incompatible_shape_error()) return success(); + + // Otherwise, check inputs are broadcastable. + return mlir::OpTrait::impl::verifyCompatibleOperandBroadcast( + op.getOperation()); +} + +void EqualOp::build(OpBuilder &builder, OperationState &result, Value x, + Value y, BoolAttr incompatible_shape_error) { + auto result_type = DeduceEqualCmpOpType(&builder, result.location, x, y, + incompatible_shape_error); + return build(builder, result, result_type, x, y, incompatible_shape_error); +} + +//===----------------------------------------------------------------------===// +// ExpandDimsOp +//===----------------------------------------------------------------------===// + +Type InferExpandDimsOpType(Value input, Value dim) { + Type element_ty = input.getType().cast().getElementType(); + auto unranked_ty = UnrankedTensorType::get(element_ty); + + auto input_ty = input.getType().dyn_cast(); + if (!input_ty) return unranked_ty; + + DenseIntElementsAttr dim_attr; + if (!matchPattern(dim, m_Constant(&dim_attr)) || + dim_attr.getNumElements() != 1) + return unranked_ty; + int64_t dim_val = (*dim_attr.begin()).getSExtValue(); + int64_t input_rank = input_ty.getRank(); + + if (dim_val < -input_rank - 1 || dim_val > input_rank + 1) return unranked_ty; + if (dim_val < 0) dim_val += input_rank + 1; + + SmallVector shape = llvm::to_vector<4>(input_ty.getShape()); + shape.insert(shape.begin() + dim_val, 1); + return RankedTensorType::get(shape, element_ty); +} + +void ExpandDimsOp::build(OpBuilder &builder, OperationState &result, + Value input, Value dim) { + return build(builder, result, InferExpandDimsOpType(input, dim), input, dim); +} + +//===----------------------------------------------------------------------===// +// FakeQuantWithMinMaxArgsOp +//===----------------------------------------------------------------------===// +static LogicalResult Verify(FakeQuantWithMinMaxArgsOp op) { + // TODO(fengliuai): moving the following to an utility method. + const llvm::fltSemantics &semantics = op.min().getSemantics(); + float rmin, rmax; + if (&semantics == &APFloat::IEEEsingle()) { + rmin = op.min().convertToFloat(); + rmax = op.max().convertToFloat(); + } else { + rmin = op.min().convertToDouble(); + rmax = op.max().convertToDouble(); + } + // Range boundaries must be valid. 
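+  // For example, [min, max] = [-6.0, 6.0] is accepted, while [1.0, 1.0] and
+  // [3.0, -3.0] are rejected.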
+ if (rmin >= rmax) { + return op.emitOpError("range is invalid: [" + Twine(std::to_string(rmin)) + + "," + Twine(std::to_string(rmax)) + "]"); + } + int64_t num_bits = op.num_bits().getSExtValue(); + if (num_bits < 2 || num_bits > 16) { + return op.emitOpError( + "requires num_bits to be between 2 and 16, inclusive"); + } + return success(); +} + +//===----------------------------------------------------------------------===// +// FakeQuantWithMinMaxVarsOp +//===----------------------------------------------------------------------===// +static LogicalResult Verify(FakeQuantWithMinMaxVarsOp op) { + auto min = GetRankedTensorTypeForOperand(op.min()); + if (min && !IsOfRankedFloatTensorType(min, 0)) + return op.emitOpError("requires min to be a 0d float tensor"); + + auto max = GetRankedTensorTypeForOperand(op.max()); + if (max && !IsOfRankedFloatTensorType(max, 0)) + return op.emitOpError("requires max to be a 0d float tensor"); + + int64_t num_bits = op.num_bits().getSExtValue(); + if (num_bits < 2 || num_bits > 16) { + return op.emitOpError( + "requires num_bits to be between 2 and 16, inclusive"); + } + return success(); +} + +//===----------------------------------------------------------------------===// +// FakeQuantWithMinMaxVarsPerChannelOp +//===----------------------------------------------------------------------===// +static LogicalResult Verify(FakeQuantWithMinMaxVarsPerChannelOp op) { + auto min = GetRankedTensorTypeForOperand(op.min()); + if (min && !IsOfRankedFloatTensorType(min, 1)) + return op.emitOpError("requires min to be a 1d float tensor"); + + auto max = GetRankedTensorTypeForOperand(op.max()); + if (max && !IsOfRankedFloatTensorType(max, 1)) + return op.emitOpError("requires max to be a 1d float tensor"); + + Value inputs = op.inputs(); + if (!HasRankAtLeast(inputs, 1)) + return op.emitError("requires inputs to be at least 1d float tensor"); + + int64_t num_bits = op.num_bits().getSExtValue(); + if (num_bits < 2 || num_bits > 16) { + return op.emitOpError( + "requires num_bits to be between 2 and 16, inclusive"); + } + + auto inputs_type = inputs.getType().dyn_cast(); + if (!inputs_type) return success(); + int depth = inputs_type.getDimSize(inputs_type.getRank() - 1); + if ((min && min.getDimSize(0) != depth) || + (max && max.getDimSize(0) != depth)) { + return op.emitOpError( + "requires min and max to have same size as last dimension of inputs"); + } + + return success(); +} + +//===----------------------------------------------------------------------===// +// FillOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(FillOp op) { + if (!IsOfRankOrUnranked(op.dims(), 1)) + return op.emitOpError() << "requires dims to be a 1D tensor"; + if (!IsOfRankOrUnranked(op.value(), 0)) + return op.emitOpError() << "requires value to be a scalar"; + + return success(); +} + +static ShapedType InferFillOpType(Value dims, Value value) { + Type etype = value.getType().cast().getElementType(); + + DenseIntElementsAttr dims_attr; + if (!matchPattern(dims, m_Constant(&dims_attr))) { + return UnrankedTensorType::get(etype); + } + + llvm::SmallVector shape; + shape.reserve(dims_attr.getNumElements()); + for (const APInt dim : dims_attr.getValues()) { + shape.push_back(dim.getSExtValue()); + } + return RankedTensorType::get(shape, etype); +} + +void FillOp::build(OpBuilder &builder, OperationState &result, Value dims, + Value value) { + FillOp::build(builder, result, InferFillOpType(dims, value), dims, value); +} + 
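+// Folds tf.Fill with constant operands. For example, dims = [2, 3] with
+// value = 9.0 folds to a splat constant of type tensor<2x3xf32> in which
+// every element is 9.0.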
+OpFoldResult FillOp::fold(ArrayRef operands) { + assert(operands.size() == 2 && "fill op has two operand"); + + auto type = getType().cast(); + // DenseElementsAttr that is used in this folder only supports int and float + // types. + // TODO(hinsu): Handle complex types once there is a attribute kind for + // complex. + if (!type.getElementType().isIntOrFloat()) return {}; + + auto value = operands[1].dyn_cast_or_null(); + if (!value) return {}; + + if (type.hasStaticShape()) + return DenseElementsAttr::get(type, value.getValue({})); + + auto dims = operands[0].dyn_cast_or_null(); + if (!dims) return {}; + + llvm::SmallVector shape; + shape.reserve(dims.getNumElements()); + for (const APInt dim : dims.getValues()) { + shape.push_back(dim.getSExtValue()); + } + type = RankedTensorType::get(shape, type.getElementType()); + + return DenseElementsAttr::get(type, value.getValue({})); +} + +//===----------------------------------------------------------------------===// +// FusedBatchNormGradOp +//===----------------------------------------------------------------------===// + +// TODO(b/150954845): Add benchmarks to verify that layout preference didn't +// change in the latest GPU generations. + +LogicalResult FusedBatchNormGradV3Op::UpdateDataFormat(StringRef data_format) { + return ::mlir::TF::UpdateDataFormat(data_format, this); +} + +StringRef FusedBatchNormGradV3Op::GetOptimalLayout( + const RuntimeDevices &devices) { + // Keep current data format if no GPUs are available or if explicit placement + // does not allow to use GPU for this operation. + if (!CanUseGpuDevice(devices) || !CanUseGpuDevice(getOperation())) + return data_format(); + + // For f16 data type on devices with Tensor Cores support NHWC data format + // is up to ~2x faster. + auto x_ty = x().getType().cast(); + const bool is_f16 = x_ty.getElementType().isF16(); + if (is_f16 && CanUseTensorCores(devices)) return "NHWC"; + + // For all other data types prefer NCHW. 
+  return "NCHW";
+}
+
+//===----------------------------------------------------------------------===//
+// FusedBatchNormOp
+//===----------------------------------------------------------------------===//
+
+static LogicalResult Verify(FusedBatchNormOp op) {
+  auto x = GetRankedTensorTypeForOperand(op.x());
+  if (x && !IsOfRankedFloatTensorType(x, 4))
+    return op.emitOpError("requires x to be a 4D float tensor");
+
+  auto scale = GetRankedTensorTypeForOperand(op.scale());
+  if (scale && !IsOfRankedFloatTensorType(scale, 1))
+    return op.emitOpError("requires scale to be a 1D float tensor");
+
+  auto offset = GetRankedTensorTypeForOperand(op.offset());
+  if (offset && !IsOfRankedFloatTensorType(offset, 1))
+    return op.emitOpError("requires offset to be a 1D float tensor");
+
+  auto mean = GetRankedTensorTypeForOperand(op.mean());
+  if (mean && !IsOfRankedFloatTensorType(mean, 1))
+    return op.emitOpError("requires mean to be a 1D float tensor");
+
+  auto variance = GetRankedTensorTypeForOperand(op.variance());
+  if (variance && !IsOfRankedFloatTensorType(variance, 1))
+    return op.emitOpError("requires variance to be a 1D float tensor");
+
+  // TODO(antiagainst): check attributes
+
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
+// FusedBatchNormV2Op / FusedBatchNormV3Op
+//===----------------------------------------------------------------------===//
+
+template <class Op>
+static LogicalResult InferenceFoldOperandsPermutation(
+    ArrayRef<int64_t> permutation, Op *op) {
+  // FusedBatchNorm in training mode is a layout sensitive operation, and
+  // should have already been assigned an optimal data format.
+  if (op->is_training()) return failure();
+  return ::mlir::TF::FoldOperandsPermutation(permutation, op);
+}
+
+template <class Op>
+static StringRef GetOptimalLayout(const RuntimeDevices &devices, Op *op) {
+  // In inference mode FusedBatchNorm is not sensitive to data layout.
+  if (!op->is_training()) return op->data_format();
+
+  // Keep current data format if no GPUs are available or if explicit placement
+  // does not allow to use GPU for this operation.
+  if (!CanUseGpuDevice(devices) || !CanUseGpuDevice(op->getOperation()))
+    return op->data_format();
+
+  // For f16 data type on devices with Tensor Cores support NHWC data format
+  // is up to ~2x faster.
+  auto x_ty = op->x().getType().template cast<TensorType>();
+  const bool is_f16 = x_ty.getElementType().isF16();
+  if (is_f16 && CanUseTensorCores(devices)) return "NHWC";
+
+  // For all other data types prefer NCHW.
+ return "NCHW"; +} + +LogicalResult FusedBatchNormV2Op::FoldOperandsPermutation( + ArrayRef permutation) { + return ::mlir::TF::InferenceFoldOperandsPermutation(permutation, this); +} + +LogicalResult FusedBatchNormV2Op::UpdateDataFormat(StringRef data_format) { + return ::mlir::TF::UpdateDataFormat(data_format, this); +} + +StringRef FusedBatchNormV2Op::GetOptimalLayout(const RuntimeDevices &devices) { + return ::mlir::TF::GetOptimalLayout(devices, this); +} + +LogicalResult FusedBatchNormV3Op::FoldOperandsPermutation( + ArrayRef permutation) { + return ::mlir::TF::InferenceFoldOperandsPermutation(permutation, this); +} + +LogicalResult FusedBatchNormV3Op::UpdateDataFormat(StringRef data_format) { + return ::mlir::TF::UpdateDataFormat(data_format, this); +} + +StringRef FusedBatchNormV3Op::GetOptimalLayout(const RuntimeDevices &devices) { + return ::mlir::TF::GetOptimalLayout(devices, this); +} + +//===----------------------------------------------------------------------===// +// GatherV2Op +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(GatherV2Op op) { + int64_t batch_dims = op.batch_dims().getSExtValue(); + if (auto ty = op.indices().getType().dyn_cast()) { + int64_t rank = ty.getRank(); + if (batch_dims > rank || batch_dims < -rank) + return op.emitOpError() + << "batch_dims (" << batch_dims << ") must be in range [" << -rank + << ", " << rank + 1 << ")"; + if (batch_dims < 0) batch_dims += rank; + } + + if (!HasRankAtMost(op.axis(), 1)) + return op.emitOpError("requires axis to have rank at most 1"); + + DenseIntElementsAttr axis_attr; + if (matchPattern(op.axis(), m_Constant(&axis_attr))) { + int64_t axis = (*axis_attr.begin()).getSExtValue(); + if (auto ty = op.params().getType().dyn_cast()) { + int64_t rank = ty.getRank(); + if (axis >= rank || axis < -rank) + return op.emitOpError() << "axis (" << axis << ") must be in range [" + << -rank << ", " << rank << ")"; + if (axis < 0) axis += rank; + } + + if (batch_dims >= 0 && axis >= 0 && axis < batch_dims) { + return op.emitOpError() << "requires axis (" << axis + << ") to be greater than or equal to batch_dims (" + << batch_dims << ")"; + } + } + return success(); +} + +//===----------------------------------------------------------------------===// +// IfOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(IfOp op) { + auto then_fn = op.then_func(); + if (!then_fn) + return op.emitOpError("then_branch refers to an undefined function : ") + << op.then_branch(); + auto else_fn = op.else_func(); + if (!else_fn) + return op.emitOpError("else_branch refers to an undefined function : ") + << op.else_branch(); + auto then_fn_type = then_fn.getType(); + auto else_fn_type = else_fn.getType(); + + // Non-conditional operands starting with the second operand are passed to + // branches and should be pair-wise compatible with branches' inputs. 
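+  // For example, "tf.If"(%cond, %a, %b) calls the branch functions with
+  // (%a, %b), so both then_branch and else_branch must accept exactly two
+  // arguments with compatible types.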
+ unsigned expected_num_inputs = op.getNumOperands() - 1; + if (then_fn_type.getNumInputs() != expected_num_inputs || + else_fn_type.getNumInputs() != expected_num_inputs) + return op.emitError("branches should have " + Twine(expected_num_inputs) + + " inputs"); + + for (unsigned i = 0; i < expected_num_inputs; ++i) { + auto operand_type = op.getOperand(i + 1).getType().cast(); + auto then_input_type = then_fn_type.getInput(i).cast(); + if (!AreCastCompatible({operand_type, then_input_type})) + return op.emitError( + llvm::formatv("then branch input type {0} is incompatible with " + "operand type {1} at index {2}", + then_input_type, operand_type, i)); + + auto else_input_type = else_fn_type.getInput(i).cast(); + if (!AreCastCompatible({operand_type, else_input_type})) + return op.emitError( + llvm::formatv("else branch input type {0} is incompatible with " + "operand type {1} at index {2}", + else_input_type, operand_type, i)); + + // If branches have incompatible input types that means that no tensor can + // serve as input to both the functions. Hence, the op is invalid. + if (!AreCastCompatible({then_input_type, else_input_type})) + return op.emitError(llvm::formatv( + "branches inputs have incompatible types {0} and {1} at index {2}", + then_input_type, else_input_type, i)); + } + + // Branches' results should be pair-wise compatible with the op results. + unsigned expected_num_results = op.getNumResults(); + if (then_fn_type.getNumResults() != expected_num_results || + else_fn_type.getNumResults() != expected_num_results) + return op.emitError("branches should have " + Twine(expected_num_results) + + " results"); + + for (unsigned i = 0; i < expected_num_results; ++i) { + auto result_type = op.getResult(i).getType().cast(); + auto then_result_type = then_fn_type.getResult(i).cast(); + if (!AreCastCompatible({then_result_type, result_type})) + return op.emitError( + llvm::formatv("then branch result type {0} is incompatible with op " + "result type {1} at index {2}", + then_result_type, result_type, i)); + + auto else_result_type = else_fn_type.getResult(i).cast(); + if (!AreCastCompatible({else_result_type, result_type})) + return op.emitError( + llvm::formatv("else branch result type {0} is incompatible with op " + "result type {1} at index {2}", + else_result_type, result_type, i)); + } + return success(); +} + +//===----------------------------------------------------------------------===// +// IfOp canonicalization. +//===----------------------------------------------------------------------===// + +class FoldConstantIfOp : public OpRewritePattern { + public: + explicit FoldConstantIfOp(MLIRContext *context) + : OpRewritePattern(context) {} + LogicalResult matchAndRewrite(TF::IfOp op, + PatternRewriter &rewriter) const override; + + private: + template + struct CallOpType { + using CallOp = T; + }; +}; + +LogicalResult FoldConstantIfOp::matchAndRewrite( + TF::IfOp op, PatternRewriter &rewriter) const { + // Extract the constant cond value. + DenseIntElementsAttr cond_attr; + if (!matchPattern(op.cond(), m_Constant(&cond_attr))) return failure(); + + // Cond value must be a scalar. + if (cond_attr.getNumElements() != 1) return failure(); + + // Select a branch function. + bool cond = cond_attr.getSplatValue().getValue(); + FlatSymbolRefAttr func = cond ? op.then_branchAttr() : op.else_branchAttr(); + + // Replace IfOp with PartitionedCallOp or StatefulPartitionedCallOp. 
+ auto rewrite = [&](auto op_type) { + auto empty = rewriter.getStringAttr(""); + auto call_op = rewriter.create( + op.getLoc(), op.getResultTypes(), op.getOperands().drop_front(), func, + /*config=*/empty, /*config_proto=*/empty, /*executor_type=*/empty); + PropagateDeviceAndInternalAttrs(op.getOperation(), call_op); + rewriter.replaceOp(op, call_op.getResults()); + }; + + if (op.is_stateless()) + rewrite(CallOpType{}); + else + rewrite(CallOpType{}); + + return success(); +} + +void IfOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert>(context); +} + +//===----------------------------------------------------------------------===// +// IfRegionOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(IfRegionOp op) { + if (failed(VerifyRegionResults(op, op.then_branch(), "then"))) + return failure(); + if (failed(VerifyRegionResults(op, op.else_branch(), "else"))) + return failure(); + return success(); +} + +//===----------------------------------------------------------------------===// +// InvertOp +//===----------------------------------------------------------------------===// + +void InvertOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} + +//===----------------------------------------------------------------------===// +// InvertPermutationOp +//===----------------------------------------------------------------------===// + +// Verifies that the input is 1D. +static LogicalResult Verify(InvertPermutationOp op) { + auto x_type = op.x().getType().cast(); + if (!x_type.hasRank()) return success(); + if (x_type.getShape().size() != 1) + return op.emitOpError() << "requires input x to be 1-dimensional"; + + return success(); +} + +//===----------------------------------------------------------------------===// +// LeakyReluOp +//===----------------------------------------------------------------------===// + +OpFoldResult LeakyReluOp::fold(ArrayRef operands) { + assert(operands.size() == 1 && "leaky relu has one operand"); + + // leaky_relu(x, alpha: 1) -> x + if (alpha().convertToFloat() == 1.0f) return getOperand(); + + auto calculate = [&](FloatAttr arg) { + APFloat val = arg.getValue(); + if (val.isNegative()) val = alpha() * val; + return FloatAttr::get(arg.getType(), val); + }; + + if (auto arg = operands[0].dyn_cast_or_null()) { + return calculate(arg); + } else if (auto arg = operands[0].dyn_cast_or_null()) { + if (auto elementAttr = arg.getSplatValue().dyn_cast()) + return DenseElementsAttr::get(arg.getType(), calculate(elementAttr)); + } + return {}; +} + +//===----------------------------------------------------------------------===// +// LogOp +//===----------------------------------------------------------------------===// + +void LogOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} + +//===----------------------------------------------------------------------===// +// LogicalNotOp +//===----------------------------------------------------------------------===// + +void LogicalNotOp::getCanonicalizationPatterns( + OwningRewritePatternList &results, MLIRContext *context) { + results.insert(context); +} + +//===----------------------------------------------------------------------===// +// MatrixBandPartOp +//===----------------------------------------------------------------------===// + +static LogicalResult 
Verify(MatrixBandPartOp op) { + if (!HasRankAtLeast(op.input(), 2)) { + return op.emitOpError() + << "requires `input` to have rank of at least 2, but found " + << op.input().getType(); + } + if (!IsOfRankOrUnranked(op.num_lower(), 0)) { + return op.emitOpError() + << "requires `num_lower` to have 0 dimensions, but found " + << op.num_lower().getType(); + } + if (!IsOfRankOrUnranked(op.num_upper(), 0)) { + return op.emitOpError() + << "requires `num_upper` to have 0 dimensions, but found " + << op.num_upper().getType(); + } + return success(); +} + +//===----------------------------------------------------------------------===// +// MaxOp +//===----------------------------------------------------------------------===// + +void MaxOp::build(OpBuilder &builder, OperationState &result, Value input, + Value reduction_indices, BoolAttr keep_dims) { + Type out_ty = + InferReductionOpType(input, reduction_indices, keep_dims, &builder); + build(builder, result, out_ty, input, reduction_indices, keep_dims); +} + +//===----------------------------------------------------------------------===// +// MaxPoolOp +//===----------------------------------------------------------------------===// + +LogicalResult MaxPoolOp::FoldOperandsPermutation( + ArrayRef permutation) { + return ::mlir::TF::FoldOperandsPermutation( + permutation, this, {{"strides", strides()}, {"ksize", ksize()}}); +} + +//===----------------------------------------------------------------------===// +// MaxPoolGradOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(MaxPoolGradOp op) { + if (!IsOfRankOrUnranked(op.orig_input(), 4)) { + return op.emitOpError() << "requires orig_input to be rank 4"; + } + if (!IsOfRankOrUnranked(op.orig_output(), 4)) { + return op.emitOpError() << "requires orig_output to be rank 4"; + } + if (!IsOfRankOrUnranked(op.grad(), 4)) { + return op.emitOpError() << "requires grad to be rank 4"; + } + return success(); +} + +//===----------------------------------------------------------------------===// +// MeanOp +//===----------------------------------------------------------------------===// + +LogicalResult MeanOp::FoldOperandsPermutation(ArrayRef permutation) { + // Reduction indices must be defined by a constant operation. + auto reduction_op = + dyn_cast_or_null(reduction_indices().getDefiningOp()); + if (!reduction_op) return failure(); + + auto reductions_value = reduction_op.value().dyn_cast(); + if (!reductions_value) return failure(); + + // Prepare new reduction indices according to operand permutation. + SmallVector shuffled_reduction; + llvm::transform(reductions_value.getIntValues(), + std::back_inserter(shuffled_reduction), + [&](APInt idx) { return permutation[idx.getSExtValue()]; }); + + // Add constant operation with a new reduction indices. + OpBuilder builder(getOperation()); + auto type = mlir::RankedTensorType::get(shuffled_reduction.size(), + builder.getIntegerType(32)); + auto values = mlir::DenseIntElementsAttr::get(type, shuffled_reduction); + auto shuffled_reduction_op = builder.create(getLoc(), values); + + // Use new reduction indices. 
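Viewed in isolation, the reduction-index remapping above is a single gather through the permutation. A minimal sketch, assuming `permutation[old_axis]` yields the axis a dimension moves to; `ShuffleReductionIndices` is an illustrative name and the example values are hypothetical.

#include <cstdint>
#include <vector>

// `permutation[old_axis]` is assumed to be the axis the dimension occupies
// after the folded operand transpose.
std::vector<int64_t> ShuffleReductionIndices(
    const std::vector<int64_t> &reduction_indices,
    const std::vector<int64_t> &permutation) {
  std::vector<int64_t> shuffled;
  shuffled.reserve(reduction_indices.size());
  for (int64_t axis : reduction_indices)
    shuffled.push_back(permutation[axis]);
  return shuffled;
}

// Example: a Mean over the spatial axes {1, 2} of an NHWC tensor becomes a
// Mean over {2, 3} once the data is viewed as NCHW, since {0, 2, 3, 1} maps
// H: 1 -> 2, W: 2 -> 3, C: 3 -> 1.
// auto nchw_axes = ShuffleReductionIndices({1, 2}, {0, 2, 3, 1});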
+ setOperand(1, shuffled_reduction_op); + + return success(); +} + +//===----------------------------------------------------------------------===// +// MulOp +//===----------------------------------------------------------------------===// + +OpFoldResult MulOp::fold(ArrayRef operands) { + return IdentityArithmeticOpFolder(*this, operands); +} + +//===----------------------------------------------------------------------===// +// TableGen'd op method definitions +//===----------------------------------------------------------------------===// + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc.inc" + +} // namespace TF +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h new file mode 100644 index 00000000000..19a927a23d7 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h @@ -0,0 +1,62 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OPS_A_M_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OPS_A_M_H_ + +#include "mlir/Dialect/Traits.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/IR/OpImplementation.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project +#include "mlir/Interfaces/DerivedAttributeOpInterface.h" // from @llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project +#include "mlir/Interfaces/LoopLikeInterface.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.h" + +namespace mlir { +namespace TF { + +class YieldOp; + +// TODO(b/131258166): TensorFlow's mutex.h defines a `mutex_lock` macro, whose +// purpose is to catch bug on `tensorflow::mutex_lock`. We don't use +// `tensorflow::mutex_lock` here but we have ops (`tf.MutexLock` and +// `tf.ConsumeMutexLock`) with getter methods named as `mutex_lock()`. Need to +// undefine here to avoid expanding the getter symbol as macro when including +// both mutex.h and this header file. 
+#undef mutex_lock + +#define GET_OP_FWD_DEFINES +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_all_ops.h.inc" +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h.inc" + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OPS_A_M_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_helpers.inc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_helpers.inc new file mode 100644 index 00000000000..71f1560aa6c --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_helpers.inc @@ -0,0 +1,600 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This is a simple include file used to simplify the splitting of the +// tf_ops.cc file. The helpers in here should be refactored and moved to +// tf_verifiers or tf_ops. +// TODO(jpienaar): Remove this file post refactoring. + +// Propagates underscore and device attributes from src to dst. +// TODO(b/158769932): This should be a general feature instead post some policy +// discussion. +static void PropagateDeviceAndInternalAttrs(Operation *src, Operation *dst) { + auto device = mlir::Identifier::get("device", src->getContext()); + for (auto named_attr : src->getAttrs()) { + if (*named_attr.first.begin() == '_' || named_attr.first == device) + dst->setAttr(named_attr.first, named_attr.second); + } +} + +//===----------------------------------------------------------------------===// +// TF op helper functions +//===----------------------------------------------------------------------===// + +// Returns the RankedTensorType for the given operand. TensorFlow constant ops +// may have non-static shape because the shape is not propagated during constant +// folding. If the defining op for the given operand is a constant op, this +// routine uses the constant op's attribute to get the actual shape. +static RankedTensorType GetRankedTensorTypeForOperand(Value operand) { + DenseElementsAttr attr; + if (matchPattern(operand, m_Constant(&attr))) { + return attr.getType().dyn_cast(); + } + return operand.getType().dyn_cast(); +} + +// Returns true if the given `value` is of ranked float tensor type with the +// given `rank`. +static inline bool IsOfRankedFloatTensorType(RankedTensorType type, int rank) { + return type && type.getRank() == rank && + type.getElementType().isa(); +} + +// Returns true if the given `value` has the specified rank or has unranked +// type. +static inline bool IsOfRankOrUnranked(Value value, int64_t rank) { + RankedTensorType type = GetRankedTensorTypeForOperand(value); + return !type || type.getRank() == rank; +} + +// Returns true if the given `value` has at least the specified rank or has +// unranked type. 
+static inline bool HasRankAtLeast(Value value, int64_t rank) { + RankedTensorType type = GetRankedTensorTypeForOperand(value); + return !type || type.getRank() >= rank; +} + +// Returns true if the given `value` has at most the specified rank or has +// unranked type. +static inline bool HasRankAtMost(Value value, int64_t rank) { + RankedTensorType type = GetRankedTensorTypeForOperand(value); + return !type || type.getRank() <= rank; +} + +static bool IsUnknownDimOrRank(int64_t dim_or_rank) { + return dim_or_rank == -1; +} + +// Returns the tf.Equal/tf.NotEqual result type given `x` and `y` and inputs. If +// `incompatible_shape_error` is true, reports error if `x` and `y` has +// incompatible shapes. Otherwise, returns a tensor type with unknown rank. +static Type DeduceEqualCmpOpType(Builder *builder, Location loc, Value x, + Value y, BoolAttr incompatible_shape_error) { + auto result_type = + OpTrait::util::getBroadcastedType(x.getType(), y.getType()); + if (!result_type) { + if (incompatible_shape_error.getValue()) { + mlir::emitError(loc, "non-broadcastable operands"); + } else { + return UnrankedTensorType::get(builder->getI1Type()); + } + } + + auto ranked_type = result_type.dyn_cast(); + if (!ranked_type) return UnrankedTensorType::get(builder->getI1Type()); + + return RankedTensorType::get(ranked_type.getShape(), builder->getI1Type()); +} + +// Returns dimension index for the given TensorFlow axis that supports negative +// indexing. +static int64_t GetDimForAxis(int64_t axis, int64_t rank) { + return axis >= 0 ? axis : axis + rank; +} + +// Infers output type for reduction ops such as SumOp, MaxOp etc. +// TODO(b/e667204a): Move this logic to shape inference once it supports custom +// inference functions. +static Type InferReductionOpType(Value input, Value reduction_indices, + BoolAttr keep_dims, Builder *builder) { + Type input_ty = input.getType(); + Type element_ty = getElementTypeOrSelf(input_ty); + + // Output type is unranked if input type is not ranked. + auto ranked_ty = input_ty.dyn_cast(); + if (!ranked_ty) return UnrankedTensorType::get(element_ty); + int64_t rank = ranked_ty.getRank(); + + DenseIntElementsAttr indices; + if (!matchPattern(reduction_indices, m_Constant(&indices))) { + // Output type is unranked if reduction indices are not constant and reduced + // dimensions are not kept. + if (!keep_dims.getValue()) return UnrankedTensorType::get(element_ty); + + // Otherwise, output type has same rank as the input. + return RankedTensorType::get(SmallVector(rank, -1), element_ty); + } + + int64_t num_reduce_dim = 0; + llvm::SmallVector is_reduce_dim(rank, false); + for (const APInt &index : indices.getValues()) { + int64_t dim = GetDimForAxis(index.getSExtValue(), rank); + // Invalid input. + if (dim < 0 || dim >= rank) return UnrankedTensorType::get(element_ty); + + if (!is_reduce_dim[dim]) { + is_reduce_dim[dim] = true; + num_reduce_dim++; + } + } + + ArrayRef shape = ranked_ty.getShape(); + SmallVector out_shape; + out_shape.reserve(rank - (keep_dims.getValue() ? 0 : num_reduce_dim)); + for (int64_t i = 0; i < rank; ++i) { + if (!is_reduce_dim[i]) + out_shape.push_back(shape[i]); + else if (keep_dims.getValue()) + out_shape.push_back(1); + } + return RankedTensorType::get(out_shape, element_ty); +} + +// Verifies that the given types are cast compatible. If not, emits appropriate +// error for the given op. If mask_one_dim is set to true, then the types are +// allowed to have one mismatching dimension. 
Masking one of the dimensions is +// useful for ops like Concat that requires all ranked inputs to have the same +// rank and match dimension sizes for all but one of the dimensions. +static LogicalResult VerifyTypesCompatibility( + Operation::operand_type_range types, bool mask_one_dim, Operation *op) { + constexpr int64_t kUninitialized = -1; + int64_t common_rank = kUninitialized; + llvm::SmallVector common_dims; + int64_t dim_to_mask = kUninitialized; + + // Initialize common_rank with rank of the first ranked type and verify that + // following ranked types have the same rank. + // Similarly, initialize each of the dimensions with the first type that has + // the dimension size available and verify that all following types have the + // same size for the dimension. However, if mask_one_dim is true, note down + // the dimension index on the first mismatch and ignore dimension at that + // index in following types. + for (Type ty : types) { + RankedTensorType ranked_ty = ty.dyn_cast(); + if (!ranked_ty) continue; + + int64_t rank = ranked_ty.getRank(); + if (common_rank == kUninitialized) { + common_rank = rank; + common_dims.resize(common_rank, kUninitialized); + } else if (common_rank != rank) { + return op->emitError() + << "operand type " << ranked_ty + << " is not compatible with preceding operands; expected rank: " + << common_rank; + } + + for (int64_t i = 0, e = common_rank; i != e; i++) { + if (i == dim_to_mask) continue; + + int64_t dim = ranked_ty.getDimSize(i); + if (dim == kUninitialized) continue; + + int64_t &common_dim = common_dims[i]; + if (common_dim == kUninitialized) { + common_dim = dim; + } else if (common_dim != dim) { + // If mask_one_dim is true, do not emit an error if this is the only + // dimension with mismatches. Note down the dimension to mask it from + // the following types. + if (mask_one_dim && dim_to_mask == kUninitialized) { + dim_to_mask = i; + continue; + } + + return op->emitError() << "operand type " << ranked_ty + << " is not compatible with preceding operands; " + "expected dimension at index " + << i << ": " << common_dim; + } + } + } + return success(); +} + +// This is a helper for the Select to SelectV2 canonicalization. The `data` rank +// refers to the rank of `t`/`e` (these two inputs have equal rank; this is +// checked in the verifier). +// +// In most cases, the predicate for Select can be used directly as the predicate +// for SelectV2. However, there is one case that varies, which is when the +// predicate is a tensor and the data is multidimensional. In this case, Select +// op semantics dictate that the predicate tensor length must match the size of +// the first data dimension. This varies from normal broadcasting semantics +// (which are used in SelectV2), so we must reshape the tensor in this case to +// be compatible. +static Value ReshapeSelectPredIfNecessary(OpBuilder *builder, Location loc, + Value cond, int data_rank) { + auto cond_tensor = cond.getType().cast(); + // Reshape is only needed in the case that the cond rank is 1 (i.e. it is + // a vector) AND t/e rank is > 1. + if (cond_tensor.getRank() != 1 || data_rank <= 1) { + // No reshape necessary. Leave cond as it is. + return cond; + } + + // This is the case where a reshape is needed. We want to construct the + // shape [x,1,...1], where x is the value in the pred tensor and the + // length of the shape is equal to data_rank. 
+ SmallVector shape(data_rank, 1); + shape[0] = cond_tensor.getShape().front(); + auto new_shape_type = + RankedTensorType::get({data_rank}, builder->getIntegerType(64)); + auto shape_attr = DenseIntElementsAttr::get(new_shape_type, shape); + auto new_shape = builder->create(loc, shape_attr); + return builder->create(loc, cond, new_shape); +} + +//===----------------------------------------------------------------------===// +// Helper functions detect device capabilities from RuntimeDevices. +//===----------------------------------------------------------------------===// + +namespace { +using DeviceNameUtils = ::tensorflow::DeviceNameUtils; +using ParsedName = ::tensorflow::DeviceNameUtils::ParsedName; + +bool IsGpuDevice(const DeviceNameUtils::ParsedName &device) { + return device.type == ::tensorflow::DEVICE_GPU; +} + +} // namespace + +// Returns true if at least one GPU device is available at runtime. +bool CanUseGpuDevice(const RuntimeDevices &devices) { + return llvm::any_of(devices.device_names(), IsGpuDevice); +} + +// Returns true if all of the GPUs available at runtime support TensorCores +// (NVIDIA compute capability >= 7.0). +bool CanUseTensorCores(const RuntimeDevices &devices) { + auto has_tensor_cores = [&](const DeviceNameUtils::ParsedName &device) { + auto md = devices.GetGpuDeviceMetadata(device); + return md ? md->cc_major().getInt() >= 7 : false; + }; + return llvm::all_of( + llvm::make_filter_range(devices.device_names(), IsGpuDevice), + has_tensor_cores); +} + +// Returns true if operation does not have explicit device placement that would +// prevent it from running on GPU device. +bool CanUseGpuDevice(Operation *op) { + auto device_attr = op->getAttrOfType("device"); + if (!device_attr || device_attr.getValue().empty()) return true; + + DeviceNameUtils::ParsedName device; + if (!DeviceNameUtils::ParseFullName(device_attr.getValue().str(), &device)) + return false; + + // We can't use GPU if operation explicitly placed on non-GPU device. + return !device.has_type || device.type == ::tensorflow::DEVICE_GPU; +} + +//===----------------------------------------------------------------------===// +// TF op helper functions to work with layout transformation. +//===----------------------------------------------------------------------===// + +SmallVector ReversePermutation(ArrayRef permutation) { + SmallVector reverse(permutation.size()); + for (size_t i = 0; i < permutation.size(); ++i) { + reverse[permutation[i]] = i; + } + return reverse; +} + +SmallVector GetDataFormatPermutation(StringRef from, StringRef to) { + if (from == "NHWC" && to == "NCHW") { + return {0, 3, 1, 2}; + } else if (from == "NCHW" && to == "NHWC") { + return {0, 2, 3, 1}; + } else { + return {}; + } +} + +// Shuffle elements in the `attr` according to the permutation. Optional +// `inner_size` allows to shuffle array attributes created from rank 2 tensors +// on outer dimension only. +ArrayAttr ShuffleArrayAttr(ArrayAttr attr, ArrayRef permutation, + int inner_size = 1) { + if (attr.size() == 0) return attr; + + assert(attr.size() % inner_size == 0); + assert(attr.size() / inner_size == permutation.size()); + + SmallVector values{attr.begin(), attr.end()}; + SmallVector shuffled(values.size()); + + for (size_t i = 0; i < permutation.size(); ++i) { + for (size_t j = 0; j < inner_size; ++j) { + shuffled[i * inner_size + j] = values[permutation[i] * inner_size + j]; + } + } + + return ArrayAttr::get(shuffled, attr.getContext()); +} + +// Shuffle ranked tensor dimensions according to the permutation. 
+Type ShuffleRankedTensorType(Type type, ArrayRef permutation) { + if (auto ranked_type = type.dyn_cast()) { + ArrayRef shape = ranked_type.getShape(); + assert(permutation.size() == shape.size()); + + SmallVector new_shape(permutation.size()); + for (size_t i = 0; i < permutation.size(); ++i) + new_shape[i] = shape[permutation[i]]; + + return RankedTensorType::get(new_shape, ranked_type.getElementType()); + } + + return type; +} + +static bool AreCancellablePermutations(DenseIntElementsAttr perm0, + DenseIntElementsAttr perm1) { + if (perm0.getNumElements() == 0 || perm1.getNumElements() == 0) return false; + if (perm0.getNumElements() != perm1.getNumElements()) return false; + + SmallVector perm0_values; + for (const auto &value : perm0.getIntValues()) + perm0_values.push_back(value.getSExtValue()); + + SmallVector perm1_values; + for (const auto &value : perm1.getIntValues()) + perm1_values.push_back(value.getSExtValue()); + + for (int i = 0; i < perm0_values.size(); ++i) { + if (perm0_values[perm1_values[i]] != i) return false; + } + + return true; +} + +// Default implementation of `LayoutSensitiveInterface::UpdateDataFormat` for +// layout sensitive operations that do not have any additional layout dependent +// attributes besides `data_format` string. +template +LogicalResult UpdateDataFormat(StringRef data_format, Op *op) { + auto perm = GetDataFormatPermutation(op->data_format(), data_format); + if (perm.empty()) return failure(); + + // Update data format attribute. + op->setAttr("data_format", StringAttr::get(data_format, op->getContext())); + + // Update types for all layout sensitive results. + auto layout_sensitive = cast(op->getOperation()); + for (unsigned idx : layout_sensitive.GetLayoutDependentResults()) { + OpResult result = op->getOperation()->getResult(idx); + result.setType(ShuffleRankedTensorType(result.getType(), perm)); + } + + return success(); +} + +// Default implementation for folding operand transpose into the operation. +// See `FoldOperandsTransposeInterface::FoldOperandsPermutation`. +template +LogicalResult FoldOperandsPermutation( + ArrayRef permutation, Op *op, + ArrayRef> shuffle_attrs = {}) { + MLIRContext *context = op->template getParentOfType().getContext(); + + // We only support NHWC <-> NCHW permutations. + static constexpr std::array kNchwToNhwc = {0, 2, 3, 1}; + static constexpr std::array kNhwcToNchw = {0, 3, 1, 2}; + + // Operation data format after folding `permutation`. + StringRef target_data_format = [&]() -> StringRef { + if (op->data_format() == "NHWC" && permutation.equals(kNchwToNhwc)) { + return "NCHW"; // cancel NCHW->NHWC operand permutation + } else if (op->data_format() == "NCHW" && permutation.equals(kNhwcToNchw)) { + return "NHWC"; // cancel NHWC->NCHW operand permutation + } else { + return ""; + } + }(); + if (target_data_format.empty()) return failure(); + + // To fold operand `permutation` into the `op` we need shuffle all layout + // dependent attributes and types with a reverse permutation, and change + // operation data format to `target_data_format`. + // + // Example: + // %1 = SomeOp(...) {data_format = NHWC} + // %2 = Transpose(%1) {permutation = NHWC->NCHW} + // %3 = Op(%2) {data_format = NCHW} + // + // To bypass %2 we have to change data format to shuffle data format from NCHW + // to NHWC, which is the reverse of operand permutation (function argument). 
+ auto reverse_permutation = + GetDataFormatPermutation(op->data_format(), target_data_format); + if (reverse_permutation.empty()) return failure(); + + op->setAttr("data_format", StringAttr::get(target_data_format, context)); + + for (auto pair : shuffle_attrs) { + StringRef attr_name = pair.first; + ArrayAttr attr_value = pair.second; + op->setAttr(attr_name, ShuffleArrayAttr(attr_value, reverse_permutation)); + } + + auto fold = cast(op->getOperation()); + for (unsigned idx : fold.GetLayoutDependentResults()) { + OpResult result = op->getOperation()->getResult(idx); + result.setType( + ShuffleRankedTensorType(result.getType(), reverse_permutation)); + } + + return success(); +} + +//===----------------------------------------------------------------------===// +// Rewrite Pattern for removing trivial Arithmetic op. +//===----------------------------------------------------------------------===// + +namespace { +// Fold Arithmetic Op if one of the operands is a constant known to be an +// Identity (e.g. X+0, X*1, etc...). For commutative operations fold if +// known identity value is either lhs or rhs. +template < + typename OpT, + typename std::enable_if::value>::type * = nullptr> +OpFoldResult IdentityArithmeticOpFolder(OpT arithmetic_op, + ArrayRef operands) { + auto lhs_type = arithmetic_op.x().getType().template cast(); + auto rhs_type = arithmetic_op.y().getType().template cast(); + auto result_type = + arithmetic_op.getResult().getType().template cast(); + + // We can fold arithmetic operation only of we can prove that we will not + // accidentally hide a broadcasting error. + auto is_valid_broadcasting = [](ShapedType operand_ty, ShapedType identity_ty, + ShapedType result_ty) -> bool { + // Scalar identity is broadcastable to any operand shape, we only need to + // check that operand has the same shape as a result. + bool scalar_identity = identity_ty.hasRank() && identity_ty.getRank() == 0; + if (scalar_identity) return operand_ty == result_ty; + + // If identity is not a scalar, we must verify that all shapes are equal + // and statically known. + // + // TODO(ezhulenev): Fold if identity shape is statically know to be + // broadcastable to the operand shape. + return operand_ty == result_ty && identity_ty == result_ty && + result_ty.hasStaticShape(); + }; + + // Check that we have a constant operand on one side (candidate for identity). + const bool is_commutative = + (std::is_same::value || std::is_same::value); + auto lhs_attr = operands[0].dyn_cast_or_null(); + auto rhs_attr = operands[1].dyn_cast_or_null(); + if (!rhs_attr && !(is_commutative && lhs_attr)) return {}; + + // Mul and Div ops have identity value one while AddV2 and SubOp have identity + // value zero. + const int identity = + (std::is_same::value || std::is_same::value || + std::is_same::value) + ? 1 + : 0; + + Type element_ty = lhs_type.getElementType(); + Attribute identity_attr; + if (auto ty = element_ty.template dyn_cast()) { + identity_attr = FloatAttr::get(ty, static_cast(identity)); + } else if (auto ty = element_ty.template dyn_cast()) { + identity_attr = IntegerAttr::get(ty, static_cast(identity)); + } else { + return {}; + } + + // Fold: Op(Operand, Identity) -> Operand. + if (rhs_attr && is_valid_broadcasting(lhs_type, rhs_type, result_type)) { + if (rhs_attr.isSplat() && rhs_attr.getSplatValue() == identity_attr) + return arithmetic_op.x(); + } + + // Fold: Op(Identity, Operand) -> Operand for commutative operations. 
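A standalone sketch of just the identity-selection logic from the folder above, leaving out the broadcast-safety and splat-constant checks; `BinOp`, `IdentityValue`, and `FoldIdentity` are illustrative names.

#include <optional>

enum class BinOp { kAddV2, kSub, kMul, kDiv, kRealDiv };

// Identity element expected on the constant side: 0 for Add/Sub, 1 for
// Mul/Div/RealDiv.
constexpr double IdentityValue(BinOp op) {
  return (op == BinOp::kAddV2 || op == BinOp::kSub) ? 0.0 : 1.0;
}

// Returns which operand survives the fold: 0 = lhs, 1 = rhs, nullopt = no
// fold. Only Add and Mul are commutative, so only they may fold when the
// identity constant sits on the left.
std::optional<int> FoldIdentity(BinOp op, std::optional<double> lhs_const,
                                std::optional<double> rhs_const) {
  const double id = IdentityValue(op);
  const bool commutative = op == BinOp::kAddV2 || op == BinOp::kMul;
  if (rhs_const && *rhs_const == id) return 0;                  // x op id -> x
  if (commutative && lhs_const && *lhs_const == id) return 1;   // id op x -> x
  return std::nullopt;
}

// FoldIdentity(BinOp::kMul, std::nullopt, 1.0) -> 0   (x * 1 == x)
// FoldIdentity(BinOp::kSub, 0.0, std::nullopt) -> nullopt   (0 - x != x)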
+ if (lhs_attr && is_commutative && + is_valid_broadcasting(rhs_type, lhs_type, result_type)) { + if (lhs_attr.isSplat() && lhs_attr.getSplatValue() == identity_attr) + return arithmetic_op.y(); + } + + return {}; +} +} // namespace + +// Verifies an reduction op's `input` and reduction `dims`. +static LogicalResult VerifyReductionInputAndDims(Value input, Value dims, + Location loc) { + auto dims_type = dims.getType().dyn_cast(); + if (!dims_type) return success(); + if (dims_type.getRank() > 1) + return emitError(loc, "dimensions can only be 0D or 1D tensor"); + + auto input_type = input.getType().dyn_cast(); + if (!input_type) return success(); + int64_t rank = input_type.getRank(); + + DenseIntElementsAttr dims_attr; + if (!matchPattern(dims, m_Constant(&dims_attr))) return success(); + for (const auto &dim_pair : llvm::enumerate(dims_attr)) { + int64_t cur_dim = dim_pair.value().getSExtValue(); + if (cur_dim < -rank || cur_dim >= rank) + return emitError(loc) + << dim_pair.index() << "-th dimension should be in the range of [-" + << rank << ", " << rank << ")"; + } + + return success(); +} + +LogicalResult VerifyRegionResults(Operation *op, Region ®ion, + StringRef region_name) { + auto op_name = op->getName().getStringRef(); + // verify that op outputs match yield inputs + YieldOp yield = cast(region.front().getTerminator()); + unsigned expected_num_results = op->getNumResults(); + if (yield.getNumOperands() != expected_num_results) + return op->emitOpError() + << region_name + " should have same number (" << expected_num_results + << ") of results as " << op_name << " but has " + << yield.getNumOperands() << " results"; + + for (int idx : llvm::seq(0, expected_num_results)) { + auto op_result_type = op->getResult(idx).getType().cast(); + auto region_result_type = + yield.getOperand(idx).getType().cast(); + if (!AreCastCompatible({region_result_type, op_result_type})) + return op->emitError(llvm::formatv( + "{0} result type {1} is incompatible with {2} " + "result type {3} at index {4}", + region_name, region_result_type, op_name, op_result_type, idx)); + } + return success(); +} + +//===----------------------------------------------------------------------===// +// Function control flow canonicalization. +//===----------------------------------------------------------------------===// + +// Eliminate attributes that are not needed, but can get attached to Ops +// during import. +template +struct DropAttributes : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + // Drop the "output_shapes" attribute. + LogicalResult matchAndRewrite(Op op, + PatternRewriter &rewriter) const override { + bool found = op.removeAttr("output_shapes") == + MutableDictionaryAttr::RemoveResult::Removed; + return success(found); + } +}; + diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc new file mode 100644 index 00000000000..ffedcb47f7e --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc @@ -0,0 +1,2326 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Sequence.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/FormatVariadic.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/Dialect/Traits.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/DialectImplementation.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/Identifier.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/OpImplementation.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Parser.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/InliningUtils.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/util/tensor_format.h" + +namespace mlir { +namespace TF { + +namespace { +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_helpers.inc" +#include "tensorflow/compiler/mlir/tensorflow/transforms/generated_canonicalize.inc" +} // namespace + +//===----------------------------------------------------------------------===// +// NegOp +//===----------------------------------------------------------------------===// + +void NegOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} + +//===----------------------------------------------------------------------===// +// NotEqualOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(NotEqualOp op) { + // If we allow inputs to have incompatible type, then nothing to do. + if (!op.incompatible_shape_error()) return success(); + + // Otherwise, check inputs are broadcastable. 
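The broadcast check referenced above, and the result-type deduction used by the NotEqual builder that follows, both reduce to NumPy-style shape broadcasting. A self-contained sketch with dynamic dimensions handled conservatively; `BroadcastShapes` is an illustrative name.

#include <algorithm>
#include <cstdint>
#include <optional>
#include <vector>

// Broadcast two ranked shapes (-1 = dynamic). Returns nullopt when the
// shapes are provably incompatible.
std::optional<std::vector<int64_t>> BroadcastShapes(std::vector<int64_t> a,
                                                    std::vector<int64_t> b) {
  if (a.size() < b.size()) std::swap(a, b);
  b.insert(b.begin(), a.size() - b.size(), 1);  // left-pad with 1s
  std::vector<int64_t> out(a.size());
  for (size_t i = 0; i < a.size(); ++i) {
    int64_t da = a[i], db = b[i];
    if (da == 1) out[i] = db;
    else if (db == 1 || da == db) out[i] = da;
    else if (da == -1 || db == -1) out[i] = -1;  // conservative: unknown size
    else return std::nullopt;                    // e.g. 2 vs 3: incompatible
  }
  return out;
}

// For tf.Equal / tf.NotEqual only the shape comes from broadcasting
// ({2, 1} vs {3} -> {2, 3}); the element type of the result is always i1.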
+ return mlir::OpTrait::impl::verifyCompatibleOperandBroadcast( + op.getOperation()); +} + +void NotEqualOp::build(OpBuilder &builder, OperationState &result, Value x, + Value y, BoolAttr incompatible_shape_error) { + auto result_type = DeduceEqualCmpOpType(&builder, result.location, x, y, + incompatible_shape_error); + return build(builder, result, result_type, x, y, incompatible_shape_error); +} + +//===----------------------------------------------------------------------===// +// OneHotOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(OneHotOp op) { + int64_t axis = op.axis().getSExtValue(); + + auto indices_ty = op.indices().getType().dyn_cast(); + if (indices_ty && + !(axis == -1 || (axis >= 0 && axis <= indices_ty.getShape().size()))) { + return op.emitOpError() + << "expected axis (" << axis << ") to be -1 or between [0, " + << indices_ty.getShape().size() << "]"; + } + + if (axis < -1) { + return op.emitOpError() << "expected axis (" << axis + << ") to be -1 or between [0, rank(indices()))"; + } + + if (!IsOfRankOrUnranked(op.depth(), 0)) { + return op.emitOpError() << "requires depth to be a scalar"; + } + if (!IsOfRankOrUnranked(op.on_value(), 0)) { + return op.emitOpError() << "requires on_value to be a scalar"; + } + if (!IsOfRankOrUnranked(op.off_value(), 0)) { + return op.emitOpError() << "requires off_value to be a scalar"; + } + + DenseIntElementsAttr depth_attr; + if (matchPattern(op.depth(), m_Constant(&depth_attr))) { + if (depth_attr.getType().getRank() != 0) + return op.emitOpError() << "requires depth to be a scalar"; + int64_t depth = depth_attr.getValue({}).getSExtValue(); + if (depth < 0) { + return op.emitOpError() << "depth must be non-negative, got: " << depth; + } + } + + return success(); +} + +static TensorType InferOneHotOpType(Value indices, Value depth, Value on_value, + Value off_value, IntegerAttr axis) { + int64_t axis_val = axis.getInt(); + Type element_ty = on_value.getType().cast().getElementType(); + auto unranked_ty = UnrankedTensorType::get(element_ty); + if (axis_val < -1) return unranked_ty; + + auto indices_ty = indices.getType().dyn_cast(); + if (!indices_ty) return unranked_ty; + + auto shape = llvm::to_vector<2>(indices_ty.getShape()); + if (axis_val == -1) axis_val = shape.size(); + + int64_t depth_val = ShapedType::kDynamicSize; + DenseIntElementsAttr depth_attr; + if (matchPattern(depth, m_Constant(&depth_attr)) && + depth_attr.getNumElements() == 1) + depth_val = (*depth_attr.begin()).getSExtValue(); + shape.insert(shape.begin() + axis_val, depth_val); + return RankedTensorType::get(shape, element_ty); +} + +void OneHotOp::build(OpBuilder &builder, OperationState &result, Value indices, + Value depth, Value on_value, Value off_value, + IntegerAttr axis) { + build(builder, result, + InferOneHotOpType(indices, depth, on_value, off_value, axis), indices, + depth, on_value, off_value, axis); +} + +//===----------------------------------------------------------------------===// +// PackOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(PackOp op) { + // TODO(hinsu): Convert variadic length attributes to derived attributes. 
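As a side note to the OneHot type inference above: the output shape is just the indices shape with the depth inserted at the (already validated) axis. A minimal sketch with `OneHotShape` as an illustrative name.

#include <cstdint>
#include <vector>

// Insert the depth dimension at `axis`; axis == -1 appends it, mirroring the
// normalization done by the inference above.
std::vector<int64_t> OneHotShape(std::vector<int64_t> indices_shape,
                                 int64_t depth, int64_t axis) {
  if (axis == -1) axis = static_cast<int64_t>(indices_shape.size());
  indices_shape.insert(indices_shape.begin() + axis, depth);
  return indices_shape;
}

// OneHotShape({4, 3}, /*depth=*/5, /*axis=*/1)  -> {4, 5, 3}
// OneHotShape({4, 3}, /*depth=*/5, /*axis=*/-1) -> {4, 3, 5}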
+ Operation::operand_range values = op.values(); + + if (failed(VerifyTypesCompatibility(values, + /*mask_one_dim=*/false, + op.getOperation()))) { + return failure(); + } + + int64_t inputs_rank = -1; + for (Value value : values) { + if (auto ty = value.getType().dyn_cast()) { + // Exit early as input types are verified to be compatible so all ranked + // tensors have the same rank. + inputs_rank = ty.getRank(); + break; + } + } + if (inputs_rank == -1) return success(); + + // The values can be packed along any of the dimensions between 0 and + // inputs rank, inclusive. Also, as the negative axis values wrap around so + // the axis value range is [-(R+1), R+1). + int64_t range_begin = -inputs_rank - 1; // Inclusive + int64_t range_end = inputs_rank + 1; // Exclusive + int64_t axis = op.axis().getSExtValue(); + if (axis < range_begin || axis >= range_end) { + return op.emitError() << "attribute 'axis' should be within range [" + << range_begin << ", " << range_end + << "); actual value: " << axis; + } + + return success(); +} + +OpFoldResult PackOp::fold(ArrayRef operands) { + // Fold pack operation if it computes the input tensor shape: + // + // %shape = tf.Shape(%arg) // [? x ...] + // %dim0 = tf.StridedSlice(%shape, 0, 1, 1) // get unknown dim0 value + // %pack = tf.Pack(dim0, ...) { axis = 0 } // [? x ...] + // + // Where `...` are some statically known dimensions. In this case %pack can be + // replaced with a %shape. This is a common pattern in models with a dynamic + // batch size. + + // Pack operation should pack at least two values. + if (values().size() < 2) return {}; + + // Dimensions packed along axis = 0 (pack scalars into vector). + if (axis().getSExtValue() != 0) return {}; + + // First packed value is defined by a strided slice operation. + auto slice_op = dyn_cast_or_null(values()[0].getDefiningOp()); + if (!slice_op) return {}; + + // Input to the slice op is defined by shape operation. + auto shape_op = dyn_cast_or_null(slice_op.input().getDefiningOp()); + if (!shape_op) return {}; + + // Input tensor, which shape is reconstructed by the pack operation. + Value tensor = shape_op.input(); + + // All masks are `0` except `shrink_axis_mask` which is equal to `1` (slicing + // scalar value from input vector). + if (slice_op.begin_mask().getSExtValue() != 0 || + slice_op.ellipsis_mask().getSExtValue() != 0 || + slice_op.end_mask().getSExtValue() != 0 || + slice_op.new_axis_mask().getSExtValue() != 0 || + slice_op.shrink_axis_mask().getSExtValue() != 1) + return {}; + + // Returns a value if the `value` is defined by a ConstOp with a single + // integer element in it and has an expected rank. + auto get_const_int = [](Value value, int expected_rank) -> Optional { + auto const_op = dyn_cast_or_null(value.getDefiningOp()); + if (!const_op) return None; + + auto value_attr = const_op.value().dyn_cast(); + if (!value_attr || value_attr.getNumElements() != 1) return None; + + auto value_ty = value_attr.getType(); + if (!value_ty.hasRank() || value_ty.getRank() != expected_rank) return None; + + auto splat = value_attr.getSplatValue(); + return splat.getValue().getSExtValue(); + }; + + // All other packed values are scalar constants. 
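The shape-side condition of the fold described above can be stated on its own: the packed constants must reproduce every static dimension of a tensor whose only dynamic dimension is the first. A standalone sketch, with `PackReconstructsShape` as an illustrative name.

#include <cstdint>
#include <vector>

// True when Pack(dim0, d1, ..., dn) rebuilds the shape of a tensor with
// shape {?, d1, ..., dn} (-1 marks the dynamic dimension).
bool PackReconstructsShape(const std::vector<int64_t> &arg_shape,
                           const std::vector<int64_t> &packed_dims) {
  if (arg_shape.empty() || arg_shape[0] != -1) return false;
  if (arg_shape.size() != packed_dims.size() + 1) return false;
  for (size_t i = 1; i < arg_shape.size(); ++i) {
    if (arg_shape[i] == -1 || arg_shape[i] != packed_dims[i - 1]) return false;
  }
  return true;
}

// PackReconstructsShape({-1, 224, 224, 3}, {224, 224, 3}) -> true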
+ SmallVector packed_dims; + packed_dims.reserve(values().size() - 1); + for (Value operand : llvm::drop_begin(values(), 1)) { + if (auto dim = get_const_int(operand, /*expected_rank=*/0)) { + packed_dims.push_back(*dim); + } else { + return {}; + } + } + + // Slice exactly the first shape dimension: + // begin = [0] end = [1], strides = [1] + auto begin = get_const_int(slice_op.begin(), /*expected_rank=*/1); + auto end = get_const_int(slice_op.end(), /*expected_rank=*/1); + auto strides = get_const_int(slice_op.strides(), /*expected_rank=*/1); + if (!begin.hasValue() || !end.hasValue() || !strides.hasValue() || + *begin != 0 || *end != 1 || *strides != 1) + return {}; + + // First tensor dimension is dynamic. + auto arg_ty = tensor.getType().dyn_cast(); + if (!arg_ty || !arg_ty.hasRank() || arg_ty.getNumDynamicDims() != 1 || + !arg_ty.isDynamicDim(0)) + return {}; + + // Argument tensor rank is equal to the number of packed dimensions. + if (arg_ty.getRank() != values().size()) return {}; + + // All other dimensions are statically known and equal to packed dims. + auto arg_dims = llvm::drop_begin(arg_ty.getShape(), 1); + if (!std::equal(arg_dims.begin(), arg_dims.end(), packed_dims.begin())) + return {}; + + // Replace %pack with %shape. + return slice_op.input(); +} + +//===----------------------------------------------------------------------===// +// PadOp +//===----------------------------------------------------------------------===// + +LogicalResult PadOp::FoldOperandsPermutation(ArrayRef permutation) { + // Paddings must be defined by a constant operation. + auto paddings_op = dyn_cast_or_null(paddings().getDefiningOp()); + if (!paddings_op) return failure(); + + auto paddings_value = paddings_op.value().dyn_cast(); + if (!paddings_value || + paddings_value.getNumElements() != permutation.size() * 2) + return failure(); + + SmallVector shuffled_paddings(paddings_value.getNumElements()); + for (auto index_pair : llvm::enumerate(paddings_value.getIntValues())) { + size_t outer_idx = index_pair.index() / 2; + size_t inner_idx = index_pair.index() % 2; + + shuffled_paddings[permutation[outer_idx] * 2 + inner_idx] = + index_pair.value().getSExtValue(); + } + + // Add constant operation with a new paddings. + OpBuilder builder(getOperation()); + auto type = mlir::RankedTensorType::get(paddings_value.getType().getShape(), + builder.getIntegerType(32)); + auto values = mlir::DenseIntElementsAttr::get(type, shuffled_paddings); + auto shuffled_paddings_op = builder.create(getLoc(), values); + + // Use new paddings. + setOperand(1, shuffled_paddings_op); + + // Change the result type. + getResult().setType(ShuffleRankedTensorType(getResult().getType(), + ReversePermutation(permutation))); + + return success(); +} + +//===----------------------------------------------------------------------===// +// ParseExampleV2Op +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(ParseExampleV2Op op) { + // NOTE(mrry): This validates properties of an op that would previously be + // validated by the TensorFlow OpDef type checker. In addition to these + // checks, the shape inference function for ParseExampleV2 validates the + // consistency of the argument and result types. + + // Validate dense variadic input and output lengths. + // NOTE(mrry): The Tdense attr is derived from dense_defaults, so we + // do not need to validate dense_defaults. 
+ auto dense_types_count = + std::distance(op.Tdense().begin(), op.Tdense().end()); + auto dense_values_count = + std::distance(op.dense_values().begin(), op.dense_values().end()); + if (dense_values_count != dense_types_count) { + return op.emitError() << "output 'dense_values' should have same length " + << "as attribute 'Tdense'"; + } + + // Validate sparse variadic output lengths. + // NOTE(mrry): The sparse_types attr is derived from sparse_values, so we + // do not need to validate sparse_values. + auto sparse_types_count = + std::distance(op.sparse_types().begin(), op.sparse_types().end()); + if (op.num_sparse() != sparse_types_count) { + return op.emitError() << "attribute 'num_sparse' should be the same as " + << "the length of attribute 'sparse_types'"; + } + if (op.sparse_indices().size() != sparse_types_count) { + return op.emitError() << "output 'sparse_indices' should have same length " + << "as attribute 'sparse_types'"; + } + if (op.sparse_shapes().size() != sparse_types_count) { + return op.emitError() << "output 'sparse_shapes' should have same length " + << "as attribute 'sparse_types'"; + } + + // Validate ragged variadic output lengths. + auto ragged_value_types_count = std::distance(op.ragged_value_types().begin(), + op.ragged_value_types().end()); + auto ragged_split_types_count = std::distance(op.ragged_split_types().begin(), + op.ragged_split_types().end()); + if (ragged_value_types_count != ragged_split_types_count) { + return op.emitError() << "attribute 'ragged_value_types' should have same " + << "length as attribute 'ragged_split_types'"; + } + + return success(); +} + +//===----------------------------------------------------------------------===// +// PartitionedCallOp +//===----------------------------------------------------------------------===// + +template +static LogicalResult VerifyPartitionedCall(OpClass op) { + auto module = op.template getParentOfType(); + SymbolRefAttr func = op.getAttr("f").template cast(); + + auto function = + dyn_cast_or_null(SymbolTable::lookupSymbolIn(module, func)); + + if (!function) { + return op.emitError("'f' attribute refers to an undefined function: ") + << func; + } + + FunctionType function_ty = function.getType(); + int func_arg_count = function_ty.getNumInputs(); + int arg_count = op.args().size(); + + if (arg_count != func_arg_count) { + return op.emitError() << "argument count mismatch: 'args' has " << arg_count + << " arguments, but '" << func << "' expects " + << func_arg_count; + } + + return success(); +} + +//===----------------------------------------------------------------------===// +// PowOp +//===----------------------------------------------------------------------===// + +OpFoldResult PowOp::fold(ArrayRef operands) { + auto constant_y = operands[1].dyn_cast_or_null(); + if (constant_y && constant_y.isSplat()) { + APFloat y_value = constant_y.getSplatValue(); + auto output_type = getType().cast(); + if (y_value.isZero() && output_type.hasStaticShape()) { + return DenseElementsAttr::get( + output_type, + FloatAttr::get(output_type.getElementType(), /*value=*/1.0)); + } + if (y_value.isExactlyValue(1.0)) { + return x(); + } + } + return {}; +} + +//===----------------------------------------------------------------------===// +// QrOp +//===----------------------------------------------------------------------===// + +// Verifies that, +// +// * Input type, if ranked, must have at least 2 dimensions and at most +// INT32_MAX dimensions. 
+// +static LogicalResult Verify(QrOp op) { + auto ttype = op.input().getType().cast(); + if (!ttype.hasRank()) return success(); + if (!HasRankAtLeast(op.input(), 2)) + return op.emitOpError( + "requires ranked input tensor to be of rank 2 or more"); + if (!HasRankAtMost(op.input(), std::numeric_limits::max())) + return op.emitOpError( + "requires ranked input tensor to be of rank INT32_MAX or less"); + + return success(); +} + +//===----------------------------------------------------------------------===// +// ReadVariableOp +//===----------------------------------------------------------------------===// + +void ReadVariableOp::getCanonicalizationPatterns( + OwningRewritePatternList &results, MLIRContext *context) { + results.insert(context); +} + +//===----------------------------------------------------------------------===// +// ReciprocalOp +//===----------------------------------------------------------------------===// + +void ReciprocalOp::getCanonicalizationPatterns( + OwningRewritePatternList &results, MLIRContext *context) { + results.insert(context); +} + +//===----------------------------------------------------------------------===// +// RandomUniformOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(RandomUniformOp op) { + if (!IsOfRankOrUnranked(op.shape(), 1)) + return op.emitOpError("shape must be 1D tensor"); + return success(); +} + +//===----------------------------------------------------------------------===// +// RangeOp +//===----------------------------------------------------------------------===// + +void RangeOp::build(OpBuilder &builder, OperationState &result, Value start, + Value limit, Value delta) { + assert(start.getType() == limit.getType()); + assert(start.getType() == delta.getType()); + DenseIntElementsAttr start_val; + DenseIntElementsAttr limit_val; + DenseIntElementsAttr delta_val; + if (matchPattern(start, m_Constant(&start_val)) && + matchPattern(limit, m_Constant(&limit_val)) && + matchPattern(delta, m_Constant(&delta_val))) { + auto size = llvm::APIntOps::RoundingSDiv( + *limit_val.begin() - *start_val.begin(), *delta_val.begin(), + llvm::APInt::Rounding::DOWN); + return RangeOp::build( + builder, result, + RankedTensorType::get( + size.getSExtValue(), + start.getType().cast().getElementType()), + start, limit, delta); + } + return RangeOp::build( + builder, result, + RankedTensorType::get( + {-1}, start.getType().cast().getElementType()), + start, limit, delta); +} +//===----------------------------------------------------------------------===// +// RankOp +//===----------------------------------------------------------------------===// + +void RankOp::build(OpBuilder &builder, OperationState &result, Value input) { + return RankOp::build(builder, result, + RankedTensorType::get({}, builder.getIntegerType(32)), + input); +} + +// This will create a constant value for RankOp of a ranked tensor. 
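As the comment above notes, tf.Rank folds for any ranked operand because only the rank, not the individual dimensions, needs to be static. A minimal sketch with `FoldRank` as an illustrative name and nullopt standing in for an unranked type.

#include <cstdint>
#include <optional>
#include <vector>

// Folds to the rank whenever the operand is ranked, even with dynamic dims.
std::optional<int32_t> FoldRank(
    const std::optional<std::vector<int64_t>> &shape) {  // nullopt = unranked
  if (!shape) return std::nullopt;  // unranked: no fold
  return static_cast<int32_t>(shape->size());
}

// FoldRank over a shape {4, -1, 3} yields 3; an unranked operand does not fold.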
+OpFoldResult RankOp::fold(ArrayRef operands) { + auto type = input().getType(); + auto ranked_type = type.dyn_cast(); + if (!ranked_type) return {}; + + auto output_type = getType().cast(); + int32_t rank = ranked_type.getRank(); + return DenseIntElementsAttr::get(output_type, rank); +} + +//===----------------------------------------------------------------------===// +// RealDivOp +//===----------------------------------------------------------------------===// + +void RealDivOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} + +OpFoldResult RealDivOp::fold(ArrayRef operands) { + return IdentityArithmeticOpFolder(*this, operands); +} + +//===----------------------------------------------------------------------===// +// ReshapeOp +//===----------------------------------------------------------------------===// + +// TODO(b/128020684): Verify the output type. +static LogicalResult Verify(ReshapeOp op) { + auto shape_type = op.shape().getType().cast(); + if (!shape_type.hasRank()) return success(); + if (shape_type.getRank() != 1) + return op.emitOpError("shape must be 1D tensor"); + auto rank_by_shape = shape_type.getShape()[0]; + auto type_of_tensor = op.tensor().getType().cast(); + // No compile time verification for unknown sized shape. + if (rank_by_shape == -1 || !type_of_tensor.hasStaticShape()) return success(); + int64_t num_by_tensor = type_of_tensor.getNumElements(); + + auto out_ty = op.getType().dyn_cast(); + if (out_ty && out_ty.hasStaticShape()) { + int64_t num_output_elements = out_ty.getNumElements(); + if (num_by_tensor != num_output_elements) + return op.emitOpError() + << "number of output elements (" << num_output_elements + << ") does not match expected number of elements (" + << num_by_tensor << ")"; + } + + // Check values if constant shape. No compiling time verification for + // non-constant shape. + auto *shape_op = op.shape().getDefiningOp(); + if (!shape_op) return success(); + Attribute shape_cst; + if (!matchPattern(shape_op, m_Constant(&shape_cst))) return success(); + auto shape_cst_attr = shape_cst.dyn_cast(); + if (!shape_cst_attr) return op.emitOpError("shape must be a valid tensor"); + + if (auto opaque_attr = shape_cst_attr.dyn_cast()) { + opaque_attr.decode(shape_cst_attr); + } + + // We know the shape is a 1-D Tensor, then let us get the number of + // elements it implies. + unsigned num_by_shape = 1; + unsigned unknown_dim_count = 0; + for (int i = 0, e = rank_by_shape; i != e; ++i) { + auto num = shape_cst_attr.getValue(i).getInt(); + // The dimension size value can be -1, and that the real size needs to + // be computed so that the total size remains constant. At most one + // component of shape can be -1. + if (num == -1) { + if (++unknown_dim_count > 1) { + return op.emitOpError("more than one component of shape are -1"); + } + } else { + num_by_shape *= num; + } + } + // If there is one component of shape is -1, the dimension should be + // computed so that the total size remains constant. + if (unknown_dim_count == 1) { + if (num_by_tensor % num_by_shape != 0) + return op.emitOpError( + "one component of shape is -1 but couldn't infer the dimension"); + return success(); + } + // If the elements by the tensor and implies by the shape don't match, + // fail this static check. 
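The element-count bookkeeping used by this verifier and by the builder below can be summarized in one helper. A standalone sketch with `ResolveReshape` as an illustrative name; it folds the at-most-one -1 rule and the divisibility requirement into a single resolution step.

#include <cstdint>
#include <optional>
#include <vector>

// Resolves a reshape target against a known element count: at most one -1
// entry, and the known dimensions must divide the element count evenly so
// the -1 can be inferred.
std::optional<std::vector<int64_t>> ResolveReshape(int64_t num_elements,
                                                   std::vector<int64_t> shape) {
  int64_t known_product = 1;
  int unknown_index = -1;
  for (size_t i = 0; i < shape.size(); ++i) {
    if (shape[i] == -1) {
      if (unknown_index != -1) return std::nullopt;  // more than one -1
      unknown_index = static_cast<int>(i);
    } else {
      known_product *= shape[i];
    }
  }
  if (unknown_index == -1) {
    if (known_product != num_elements) return std::nullopt;  // count mismatch
    return shape;
  }
  if (known_product == 0 || num_elements % known_product != 0)
    return std::nullopt;  // the -1 cannot be inferred
  shape[unknown_index] = num_elements / known_product;
  return shape;
}

// ResolveReshape(24, {2, -1, 3}) yields {2, 4, 3};
// ResolveReshape(24, {5, -1}) fails because 5 does not divide 24.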
+ if (num_by_tensor != num_by_shape) { + return op.emitOpError( + "mismatch in tensor elements and shape implied elements"); + } + return success(); +} + +void ReshapeOp::build(OpBuilder &builder, OperationState &result, Value tensor, + Value shape) { + auto ttype = tensor.getType().cast(); + auto etype = ttype.getElementType(); + + auto unranked = [&builder, etype, &result, shape, tensor]() { + return ReshapeOp::build(builder, result, UnrankedTensorType::get(etype), + tensor, shape); + }; + + // If tensor is unranked then we have no info about output of shape. + if (!ttype.hasRank()) return unranked(); + + DenseIntElementsAttr attr_shape; + if (matchPattern(shape, m_Constant(&attr_shape))) { + llvm::SmallVector const_shape; + const_shape.reserve(attr_shape.getNumElements()); + + // Detect if reshape output shape is folded. + bool flatten = false; + int unknown_index = -1; + // The product of constant shape argument excluding unknown dimension. + int64_t product_cshape = 1; + for (auto e : llvm::enumerate(attr_shape)) { + int64_t val = e.value().getSExtValue(); + if (IsUnknownDimOrRank(val)) { + if (flatten) { + mlir::emitError(result.location) + << "only one unknown dimension allowed"; + return; + } + flatten = true; + unknown_index = e.index(); + } else { + product_cshape *= val; + } + const_shape.push_back(val); + } + + // Compute the value of the unknown dimension. + if (flatten) { + // Compute number of elements in tensor shape. + auto tshape = ttype.getShape(); + int64_t product_tshape = std::accumulate(tshape.begin(), tshape.end(), 1, + std::multiplies()); + // Set the unknown dimension such that total number of elements remain + // constant. + // Note: The case where the ratio is not integral, and so the total size + // of reshape not constant, is checked in verify function. + const_shape[unknown_index] = product_tshape / product_cshape; + } + return ReshapeOp::build(builder, result, + RankedTensorType::get(const_shape, etype), tensor, + shape); + } + return unranked(); +} + +void ReshapeOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} + +OpFoldResult ReshapeOp::fold(ArrayRef operands) { + Value tensor = this->tensor(); + + // Fold reshape if operand and result types are the same and all dimensions + // are statically known (no-op reshape). + // TODO(ezhulenev): Add the same folding for BroadcastToOp. + auto result_ty = getType().dyn_cast(); + if (result_ty && result_ty.hasStaticShape() && + result_ty == tensor.getType()) { + return tensor; + } + + return {}; +} + +//===----------------------------------------------------------------------===// +// SelectOp +//===----------------------------------------------------------------------===// + +void SelectOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} + +// Verifies a few extra requirements on SelectOp: +// (1) `then` and `else` must have same shape +// (2) At least one of the following must be true: +// (a) `cond` has the same rank as `then` and `else` +// (b) `cond` is a scalar +// (c) `cond` is a vector AND `then` and `else` are non-scalar with their +// first dimension equal to `cond`. +static LogicalResult Verify(SelectOp op) { + auto then_tensor = op.t().getType().cast(); + auto else_tensor = op.e().getType().cast(); + // Check (1). 
+ if (!AreCastCompatible({then_tensor, else_tensor})) + return op.emitOpError() << "requires t and e have compatible shapes"; + + // Get data rank (if exists). + int data_rank; + // If data is unranked or data_rank is 0, this will remain -2. Otherwise + // refers to first dimension of then and/or else. + int data_first_dim = -2; + bool then_has_rank = then_tensor.hasRank(); + bool else_has_rank = else_tensor.hasRank(); + if (then_has_rank && else_has_rank) { + data_rank = then_tensor.getRank(); + if (then_tensor.getRank() > 0) + data_first_dim = then_tensor.getShape().front(); + if (else_tensor.getRank() > 0) + data_first_dim = std::max( + static_cast(else_tensor.getShape().front()), data_first_dim); + } else if (then_has_rank) { + data_rank = then_tensor.getRank(); + if (then_tensor.getRank() > 0) + data_first_dim = then_tensor.getShape().front(); + } else if (else_has_rank) { + data_rank = else_tensor.getRank(); + if (else_tensor.getRank() > 0) + data_first_dim = else_tensor.getShape().front(); + } else { + // Neither has a rank. + return success(); + } + + auto cond_tensor = op.condition().getType().dyn_cast(); + if (!cond_tensor) return success(); + auto cond_rank = cond_tensor.getRank(); + // Check (2a) and (2b). + if (cond_rank == 0 || cond_rank == data_rank) return success(); + // Check (2c). + if (cond_rank == 1) { + auto cond_shape = cond_tensor.getShape().front(); + if (data_rank == 0) { + return op.emitOpError() + << "requires that t and e are nonscalar when pred is a vector"; + } + // We know `data` tensor has a rank of at least 1. + if (data_first_dim != -1 && cond_shape != -1 && + data_first_dim != cond_shape) { + return op.emitOpError() << "requires that, when pred is a vector, the " + "shape matches the first dimension of t and e"; + } + return success(); + } + // None of (2a,b,c) were true; fail. + return op.emitOpError() << "requires that pred is a scalar OR has the same " + "rank as t and e OR is a vector"; +} + +//===----------------------------------------------------------------------===// +// SelectV2Op +//===----------------------------------------------------------------------===// + +static Type InferSelectV2OpType(Value condition, Value e, Value t) { + Type element_ty = e.getType().cast().getElementType(); + auto unranked_ty = UnrankedTensorType::get(element_ty); + + Type broadcasted_ty = + OpTrait::util::getBroadcastedType(e.getType(), t.getType()); + if (!broadcasted_ty) return unranked_ty; + + auto cond_ranked_ty = condition.getType().dyn_cast(); + auto broadcasted_ranked_ty = broadcasted_ty.dyn_cast(); + if (!cond_ranked_ty || !broadcasted_ranked_ty) return unranked_ty; + + // Explicitly get broadcasted output type as element types of condition may + // not be same as the broadcated type's element type. + SmallVector result_shape; + if (!OpTrait::util::getBroadcastedShape(cond_ranked_ty.getShape(), + broadcasted_ranked_ty.getShape(), + result_shape)) + return unranked_ty; + return RankedTensorType::get(result_shape, element_ty); +} + +void SelectV2Op::build(OpBuilder &builder, OperationState &result, + Value condition, Value e, Value t) { + build(builder, result, InferSelectV2OpType(condition, e, t), condition, e, t); +} + +//===----------------------------------------------------------------------===// +// ShapeOp +//===----------------------------------------------------------------------===// + +namespace { +// Validates Shape/ShapeN/VariableShape operand and associated result types. 
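+// For example (hypothetical types), an operand of tensor<2x3x4xf32> admits a
+// result of tensor<3xi32> or tensor<3xi64>; tensor<2xi32> fails the rank/size
+// check, tensor<3x1xi32> fails the 1D requirement, and tensor<3xf32> fails
+// the int32/int64 element type requirement.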
+LogicalResult VerifyShapeOperandAndResult(Operation *op, Type operand_type, + Type result_type, + int variadic_idx = -1) { + std::string variadic_idx_str = + variadic_idx < 0 ? "" : llvm::formatv(" #{0}", variadic_idx).str(); + + auto result_ranked_type = result_type.dyn_cast(); + if (!result_ranked_type) return success(); + if (result_ranked_type.getShape().size() != 1) + return op->emitOpError("requires 1D type for result") << variadic_idx_str; + + auto operand_ranked_type = operand_type.dyn_cast_or_null(); + if (operand_ranked_type) { + // The operand is a ranked tensor. + if (result_ranked_type.hasStaticShape() && + !operand_ranked_type.getShape().empty() && + result_ranked_type.getDimSize(0) != + operand_ranked_type.getShape().size()) + return op->emitOpError("requires dimension size of result") + << variadic_idx_str << " to match rank of operand" + << variadic_idx_str; + } else if (result_ranked_type.hasStaticShape()) { + // The operand is an unranked tensor, print a warning if the result + // is static. + // Note: We do not handle this situation as an error, this would be too + // restrictive due to incompleteness of shape inference at this point. + op->emitWarning("has static shape result") + << variadic_idx_str << " for unranked operand" << variadic_idx_str; + } + + Type element_type = result_ranked_type.getElementType(); + if (!element_type.isSignlessInteger(32) && + !element_type.isSignlessInteger(64)) + return op->emitOpError("requires int32 or int64 return type for result") + << variadic_idx_str; + + return success(); +} +} // anonymous namespace + +static LogicalResult Verify(ShapeOp op) { + return VerifyShapeOperandAndResult(op, op.input().getType(), op.getType()); +} + +// Converts shape of the given type to attribute if it is of ranked tensor type. +// Returned attribute has integer elements of the given width. +static Attribute ConvertShapeToAttr(Type input_ty, int out_width) { + auto ranked_ty = input_ty.dyn_cast(); + if (!ranked_ty || !ranked_ty.hasStaticShape()) return {}; + + auto shape = ranked_ty.getShape(); + int rank = shape.size(); + + SmallVector dimensions; + dimensions.reserve(rank); + for (int i = 0; i < rank; ++i) + dimensions.push_back(APInt(out_width, shape[i])); + + auto result_type = RankedTensorType::get( + {rank}, IntegerType::get(out_width, input_ty.getContext())); + return DenseElementsAttr::get(result_type, dimensions); +} + +OpFoldResult ShapeOp::fold(ArrayRef operands) { + int width = + getType().cast().getElementType().getIntOrFloatBitWidth(); + return ConvertShapeToAttr(getOperand().getType(), width); +} + +void ShapeOp::build(OpBuilder &builder, OperationState &result, Value input, + BoolAttr use32Bit) { + auto rankedTensorType = input.getType().dyn_cast(); + int64_t rank = rankedTensorType ? rankedTensorType.getRank() : -1; + auto out_type = use32Bit.getValue() ? 
builder.getIntegerType(32) + : builder.getIntegerType(64); + return ShapeOp::build(builder, result, + RankedTensorType::get({rank}, out_type), input); +} + +//===----------------------------------------------------------------------===// +// ShapeNOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(ShapeNOp op) { + const size_t num_tensors = op.N(); + + if (op.getNumOperands() != num_tensors) + return op.emitOpError() << "requires " << num_tensors << " operand(s), got " + << op.getNumOperands() << " operand(s)"; + + if (op.getNumResults() != num_tensors) + return op.emitOpError() << "requires " << num_tensors << " result(s), got " + << op.getNumResults() << " result(s)"; + + for (auto i : llvm::seq(0, num_tensors)) { + auto verification = VerifyShapeOperandAndResult( + op, op.getOperand(i).getType(), op.getResult(i).getType(), i); + if (failed(verification)) return verification; + } + + return success(); +} + +LogicalResult ShapeNOp::fold(ArrayRef operands, + SmallVectorImpl &results) { + if (getNumOperands() == 0) return success(); + int width = + getType(0).cast().getElementType().getIntOrFloatBitWidth(); + + for (Type input_ty : getOperandTypes()) { + OpFoldResult result = ConvertShapeToAttr(input_ty, width); + if (!result) return failure(); + + results.push_back(result); + } + return success(); +} + +// TODO(hinsu): Add canonicalization pattern for ShapeN ops that don't have all +// static input shapes. Replacing output values corresponding to static input +// types may enable optimizations in users of the values. + +//===----------------------------------------------------------------------===// +// SizeOp +//===----------------------------------------------------------------------===// + +// Verifies that, +// +// * Input type, if is a ranked tensor, has at most INT32_MAX dimensions. +// +static LogicalResult Verify(SizeOp op) { + if (!HasRankAtMost(op.input(), std::numeric_limits::max())) + return op.emitOpError( + "requires ranked input tensor to be of rank INT32_MAX or less"); + + return success(); +} + +//===----------------------------------------------------------------------===// +// SliceOp +//===----------------------------------------------------------------------===// + +// Verifies that: +// +// - operands begin and size are 1D with the same number of elements. +// - if the input is a ranked tensor, the rank of the input equals the number +// of elements in operands begin and size. 
+// - if begin are constants, that +// 0 <= begin[i] <= begin[i] + size[i] <= input_ty.getShape()[i] +// - if begins aren't constant but the input is a ranked tensor, that +// size[i] <= input_ty.getShape()[i] +// +static LogicalResult Verify(SliceOp op) { + RankedTensorType begin_ty = GetRankedTensorTypeForOperand(op.begin()); + if (begin_ty && begin_ty.getRank() != 1) { + return op.emitOpError() << "requires begin operand to be 1D tensor"; + } + + RankedTensorType size_ty = GetRankedTensorTypeForOperand(op.size()); + if (size_ty && size_ty.getRank() != 1) { + return op.emitOpError() << "requires size operand to be 1D tensor"; + } + + if (!begin_ty || !size_ty || !begin_ty.hasStaticShape() || + !size_ty.hasStaticShape()) + return success(); + + if (begin_ty.getNumElements() != size_ty.getNumElements()) { + return op.emitOpError() << "requires begin and size operands to have the" + " same number of elements"; + } + + auto input_ty = op.input().getType().dyn_cast(); + if (input_ty && begin_ty.getNumElements() != input_ty.getRank()) { + return op.emitOpError() << "requires number of elements in begin and size" + "are equal to input rank"; + } + + DenseIntElementsAttr begin_indices; + if (matchPattern(op.begin(), m_Constant(&begin_indices))) { + DenseIntElementsAttr slice_sizes; + bool constant_slice_sizes = + matchPattern(op.size(), m_Constant(&slice_sizes)); + int dim = 0; + for (const APInt &raw_begin_index : begin_indices.getValues()) { + int64_t begin_index = raw_begin_index.getSExtValue(); + int64_t input_size = input_ty ? input_ty.getShape()[dim] : -1; + int64_t slice_size = constant_slice_sizes + ? slice_sizes.getValue(dim).getSExtValue() + : 0; + if (slice_size == -1 && input_size != -1) { + slice_size = input_size - begin_index; + } + if (begin_index < 0 || + (input_size != -1 && begin_index + slice_size > input_size)) { + return op.emitOpError() + << "requires 0 <= begin[i] <= begin[i] + size[i] <= Di"; + } + ++dim; + } + } else if (input_ty) { + // If the inputs are ranked, we can do a few more sanity checks. + DenseIntElementsAttr slice_sizes; + if (matchPattern(op.size(), m_Constant(&slice_sizes))) { + auto input_shape = input_ty.getShape(); + for (int64_t i = 0; i < input_ty.getRank(); ++i) { + int64_t slice_size = slice_sizes.getValue(i).getInt(); + int64_t input_size = input_shape[i]; + if (slice_size != -1 && input_size != -1 && slice_size > input_size) { + return op.emitOpError() << "requires size[i] <= Di, even if begin[i] " + "is unknown at compile time"; + } + } + } + } + + return success(); +} + +//===----------------------------------------------------------------------===// +// SoftmaxOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(SoftmaxOp op) { + if (!HasRankAtLeast(op.logits(), 1)) { + return op.emitOpError("requires operand to have rank at least 1"); + } + return success(); +} + +//===----------------------------------------------------------------------===// +// SoftmaxCrossEntropyWithLogitsOp +//===----------------------------------------------------------------------===// + +// Verifies that, +// +// * Input types are broadcast compatible and the broadcasted type has rank two. 
+// +static LogicalResult Verify(SoftmaxCrossEntropyWithLogitsOp op) { + auto broadcasted_ty = OpTrait::util::getBroadcastedType( + op.features().getType(), op.labels().getType()) + .dyn_cast_or_null(); + if (!broadcasted_ty || + (broadcasted_ty.hasRank() && broadcasted_ty.getRank() != 2)) + return op.emitOpError( + "requires features and labels to be broadcast compatible to rank two"); + + return success(); +} + +//===----------------------------------------------------------------------===// +// SparseSoftmaxCrossEntropyWithLogitsOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(SparseSoftmaxCrossEntropyWithLogitsOp op) { + if (!IsOfRankOrUnranked(op.features(), 2)) { + return op.emitOpError("requires features operand of rank two"); + } + if (!IsOfRankOrUnranked(op.labels(), 1)) { + return op.emitOpError("requires labels operand of rank one"); + } + auto features_ty = op.features().getType().dyn_cast(); + auto labels_ty = op.labels().getType().dyn_cast(); + if (features_ty && labels_ty) { + int64_t features_batches = features_ty.getDimSize(0); + int64_t labels_batches = labels_ty.getDimSize(0); + if (!ShapedType::isDynamic(features_batches) && + !ShapedType::isDynamic(labels_batches) && + features_batches != labels_batches) + return op.emitOpError( + "requires features and labels with matching first dimension"); + } + return success(); +} + +//===----------------------------------------------------------------------===// +// SplitOp +//===----------------------------------------------------------------------===// + +// Verifies the input and split dimension operands for tf.Split/tf.SplitV. +// Writes the split dimension's index (adjusted with input rank) via `dim_index` +// if it's a constant. +template +LogicalResult VerifySplitInputAndSplitDim(Op op, Optional *dim_index) { + *dim_index = llvm::None; + + Value split_dim = op.split_dim(); + if (auto split_dim_type = split_dim.getType().dyn_cast()) + if (split_dim_type.getRank() != 0) + return op.emitOpError( + "split dimension should be an integer scalar tensor"); + + // We can perform further verification if the input tensor to be split has + // known rank and the split dimension tensor is a constant. 
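+  // For example (hypothetical input), with a value of type tensor<4x6xf32>
+  // (rank 2), a constant split_dim must lie in [-2, 2); -1 is accepted and
+  // normalized to 1, while 2 is rejected by the range check below.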
+ + auto input_type = op.value().getType().template dyn_cast(); + if (!input_type) return success(); + + int64_t input_rank = input_type.getRank(); + if (input_rank == 0) + return op.emitOpError("cannot split scalar input tensor"); + + DenseIntElementsAttr split_dim_attr; + if (!matchPattern(split_dim, m_Constant(&split_dim_attr))) return success(); + + int64_t index = (*split_dim_attr.begin()).getSExtValue(); + + if (index + input_rank < 0 || index >= input_rank) { + return op.emitOpError("split dimension must be in range [-") + << input_rank << ", " << input_rank << ")"; + } + + if (index < 0) index += input_rank; + *dim_index = index; + + return success(); +} + +static LogicalResult Verify(SplitOp op) { + Optional dim_index; + if (failed(VerifySplitInputAndSplitDim(op, &dim_index))) return failure(); + if (!dim_index) return success(); + + int64_t input_dim_size = + op.value().getType().cast().getDimSize(*dim_index); + if (input_dim_size == ShapedType::kDynamicSize) return success(); + + if (input_dim_size % op.getNumResults() != 0) + return op.emitOpError("dimension #") + << *dim_index << " not divisible by the number of result tensors"; + + return success(); +} + +//===----------------------------------------------------------------------===// +// SplitVOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(SplitVOp op) { + auto split_sizes_type = + op.size_splits().getType().dyn_cast(); + if (!split_sizes_type) return success(); + + if (split_sizes_type.getRank() != 1 || + split_sizes_type.getDimSize(0) != op.getNumResults()) + return op.emitOpError("split sizes should be a 1D tensor of ") + << op.getNumResults() << " elements"; + + Optional dim_index = 0; + if (failed(VerifySplitInputAndSplitDim(op, &dim_index))) return failure(); + if (!dim_index) return success(); + + int64_t input_dim_size = + op.value().getType().cast().getDimSize(*dim_index); + if (input_dim_size == ShapedType::kDynamicSize) return success(); + + // If split sizes come from a constant, they must sum to the dimension size + // along split_dim, and we can have no more than one dynamic dimension. + DenseIntElementsAttr split_sizes_attr; + if (!matchPattern(op.size_splits(), m_Constant(&split_sizes_attr))) + return success(); + + int64_t total_dim_size = 0; // Total dimension size assigned to splits + llvm::Optional dynamic_dim_index; + + SmallVector split_sizes; + split_sizes.reserve( + split_sizes_attr.getType().cast().getNumElements()); + + for (auto dim : llvm::enumerate(split_sizes_attr)) { + int64_t dim_val = dim.value().getSExtValue(); + split_sizes.push_back(dim_val); + if (dim_val == ShapedType::kDynamicSize) { + // We cannot have more than one dynamic dimension. 
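+      // For example (hypothetical sizes), splitting a dimension of size 10:
+      // size_splits [2, 3, 5] is accepted, [2, 3, 4] is rejected below for
+      // summing to 9, [2, 3, -1] is accepted with 5 left for the dynamic
+      // slot, and [2, -1, -1] is rejected here for having two dynamic sizes.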
+ if (dynamic_dim_index) + return op.emitOpError( + "cannot have more than one dynamic dimension in split sizes"); + dynamic_dim_index = dim.index(); + } else { + total_dim_size += dim_val; + } + } + + if (!dynamic_dim_index && total_dim_size != input_dim_size) + return op.emitOpError( + "split sizes must sum up to the dimension size along split " + "dimension, found ") + << total_dim_size << " vs " << input_dim_size; + + if (dynamic_dim_index && total_dim_size > input_dim_size) + return op.emitOpError( + "split sizes must sum up to be less than or equal to the " + "dimension size along split dimension, found ") + << total_dim_size << " vs " << input_dim_size; + + return success(); +} + +//===----------------------------------------------------------------------===// +// SquareOp +//===----------------------------------------------------------------------===// + +void SquareOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} + +//===----------------------------------------------------------------------===// +// SubOp +//===----------------------------------------------------------------------===// + +void SubOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} + +OpFoldResult SubOp::fold(ArrayRef operands) { + return IdentityArithmeticOpFolder(*this, operands); +} + +//===----------------------------------------------------------------------===// +// SumOp +//===----------------------------------------------------------------------===// + +void SumOp::build(OpBuilder &builder, OperationState &result, Value input, + Value reduction_indices, BoolAttr keep_dims) { + Type out_ty = + InferReductionOpType(input, reduction_indices, keep_dims, &builder); + build(builder, result, out_ty, input, reduction_indices, keep_dims); +} + +//===----------------------------------------------------------------------===// +// StridedSliceOp +//===----------------------------------------------------------------------===// + +// TODO(b/154160827): Add a canonicalization pattern from tf.StridedSliceOp to +// tf.SliceOp if both of the following are true: +// - All strides have a known value equal to 1 +// - No masks are set (or masks can be applied by transforming the inputs to +// Slice) + +// Verifies that, +// +// - begin, end and strides operands are 1D and they have the same number of +// elements. Here, the number of elements should be less than 32 to support +// 32-bit mask attributes. +// - None of the strides values are zero. +// - Ellipsis mask can have at most one bit set. + +template +static LogicalResult VerifyStridedSliceBase(OpTy op) { + // Expected size for operands begin, end and strides vector operands. + int64_t expected_size = -1; + + for (Value val : {op.begin(), op.end(), op.strides()}) { + auto operand_ty = val.getType().dyn_cast(); + if (!operand_ty || !operand_ty.hasStaticShape()) { + // TensorFlow constant ops may have non-static shape because the shape is + // not propagated during constant folding. If the defining op for this + // operand is a constant op, use the constant op's attribute to get the + // actual shape. 
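+      // For example (hypothetical IR), a begin operand produced by a tf.Const
+      // whose result type is tensor<?xi32> but whose value attribute is
+      // dense<[0, 1]> : tensor<2xi32> is handled here: the attribute's type
+      // supplies the static 1D shape of length 2 used by the checks below.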
+ DenseIntElementsAttr attr; + if (!matchPattern(val, m_Constant(&attr))) continue; + operand_ty = attr.getType(); + } + + if (operand_ty.getRank() != 1) + return op.emitOpError() + << "requires begin, end and strides to be 1D tensors"; + + int64_t length = operand_ty.getDimSize(0); + if (length == -1) continue; + + if (expected_size == -1) { + // This op uses 32-bit masks. + if (length >= 32) + return op.emitOpError( + "requires begin, end and strides operands with less than 32 " + "elements"); + + expected_size = length; + } else if (length != expected_size) { + return op.emitOpError() << "requires begin, end and strides to have the " + "same number of elements"; + } + } + + // If strides are constants, verify that none of the element is zero. + DenseIntElementsAttr strides; + if (matchPattern(op.strides(), m_Constant(&strides))) { + if (llvm::is_contained(strides.getValues(), 0)) + return op.emitOpError("requires non-zero strides"); + } + + // Use bit compares to ensure ellipsis_mask is 0 or a power of 2, i.e. there + // exists only no more than one ellipsis. + uint32_t ellipsis_mask = op.ellipsis_mask().getZExtValue(); + if (ellipsis_mask != 0 && !llvm::isPowerOf2_32(ellipsis_mask)) + return op.emitOpError("cannot have multiple ellipses"); + + return success(); +} + +// Clamps the given `val`: returns `low` if `val` is less than `low`; returns +// `high` if `high` is less than `val`; otherwise returns `val`. +template +constexpr const T &Clamp(const T &val, const T &low, const T &high) { + assert(!(high < low)); + return (val < low) ? low : (high < val) ? high : val; +} + +// Checks if the `index` bit of `val` is set. +template +constexpr bool IsSet(const T &val, unsigned index) { + return (val & (1 << index)) != 0; +} + +// Sets the `index` bit of `val`. +template +constexpr void Set(T &val, unsigned index) { + val |= (1 << index); +} + +// Unset the `index` bit of `val`. +template +constexpr void Unset(T &val, unsigned index) { + val &= ~(1 << index); +} + +// Copy the `src_index` bit of `src` to `dst_index` bit of `dst`. +template +constexpr void CopyBit(const T &src, unsigned src_index, T &dst, + unsigned dst_index) { + if (IsSet(src, src_index)) + Set(dst, dst_index); + else + Unset(dst, dst_index); +} + +// The sparse spec of strided slice does not correspond to the number of +// dimensions. For example, sparse spec for foo[..., 3:10] for foo of shape (2, +// 4, 8) would have dims = 2. +struct SparseSliceSpec { + int64_t dims; + int32_t begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask; + const ArrayRef &begin; + const ArrayRef &end; + const ArrayRef &strides; +}; + +// The dense spec of strided slice is the canonicalized version of sparse spec. +// The number of dimensions of dense spec correspond to the number of dimensions +// in operand tensor. +struct DenseSliceSpec { + int64_t dims; + int32_t begin_mask, end_mask, shrink_axis_mask; + SmallVectorImpl &begin; + SmallVectorImpl &end; + SmallVectorImpl &strides; +}; + +// Make a sparse spec into a dense index spec. +// The sparse spec does not correspond to the number of dimensions +// Make a dense spec that corresponds to the number of dimensions +// +// For example suppose foo[...,3:, 2] on foo.shape=(2,2,3,4) then +// we need to produce the missing begin_mask, end_mask for the first two +// dimensions i.e. foo[:, :, 3:, 2]. 
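+//
+// Continuing that example (a rough walk-through of the expansion below): the
+// sparse spec has dims = 3 with the ellipsis at sparse index 0, "3:" at
+// sparse index 1 (begin 3, end mask set), and "2" at sparse index 2 (shrink
+// axis mask set). The dense spec has dims = 4: the ellipsis expands into
+// dense dims 0 and 1 with begin/end masks set and stride 1, dense dim 2
+// receives begin 3 with the end mask, and dense dim 3 receives begin 2 with
+// the shrink axis bit.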
+static void BuildDenseSliceSpec(const SparseSliceSpec &sparse, + DenseSliceSpec *dense) { + // Build expanded dense begin, end, strides, begin_mask, end_mask, and + // shrink_axis_mask. + dense->begin.resize(dense->dims); + dense->end.resize(dense->dims); + dense->strides.resize(dense->dims); + dense->begin_mask = 0; + dense->end_mask = 0; + dense->shrink_axis_mask = 0; + + // Count number of new_axis after ellipsis. This helps in calculating the + // number of dimensions ellipsis represents in the sparse spec. + bool ellipsis_seen = false; + int num_new_axis_after_ellipsis = 0; + for (int sparse_index = 0; sparse_index < sparse.dims; ++sparse_index) { + if (ellipsis_seen && IsSet(sparse.new_axis_mask, sparse_index)) + num_new_axis_after_ellipsis++; + if (IsSet(sparse.ellipsis_mask, sparse_index)) ellipsis_seen = true; + } + + int dense_index = 0; + for (int sparse_index = 0; sparse_index < sparse.dims; ++sparse_index) { + if (IsSet(sparse.new_axis_mask, sparse_index)) continue; + if (IsSet(sparse.ellipsis_mask, sparse_index)) { + auto next_index = std::min(dense->dims - (sparse.dims - sparse_index) + + 1 + num_new_axis_after_ellipsis, + dense->dims); + // Expand ellipsis into the appropriate dense indices. From current index + // until next_index, all dimensions would have begin and end masks set and + // stride 1, i.e., get all elements in those dimensions. + for (; dense_index < next_index; ++dense_index) { + dense->begin[dense_index] = dense->end[dense_index] = 0; + dense->strides[dense_index] = 1; + Set(dense->begin_mask, dense_index); + Set(dense->end_mask, dense_index); + } + continue; + } + assert(dense_index < dense->dims); + // Copy over the sparse indices to dense indices if ellipsis_mask and + // new_axis_mask are not set. + dense->begin[dense_index] = sparse.begin[sparse_index]; + dense->end[dense_index] = sparse.end[sparse_index]; + dense->strides[dense_index] = sparse.strides[sparse_index]; + CopyBit(sparse.begin_mask, sparse_index, dense->begin_mask, dense_index); + CopyBit(sparse.end_mask, sparse_index, dense->end_mask, dense_index); + CopyBit(sparse.shrink_axis_mask, sparse_index, dense->shrink_axis_mask, + dense_index); + dense_index++; + } +} + +// For the given `input_shape`, calculates the sliced shape using the given +// `begin`, `end`, and `stride` ranges and `begin_mask`, `end_mask`, and +// `shrink_axis_mask` masks. Updates the result back to `input_shape`. If +// `shrink_axis_mask` is not zero, this function will not drop the corresponding +// dimensions in `input_shape`; it will turn them into 1s. At the same time, +// canonicalizes `begin`, `end`, and `strides. The calculation follows +// tf.StridedSlice op semantics. +static void CalculateSlicedShapeFromDenseIndices( + MutableArrayRef input_shape, int32_t begin_mask, int32_t end_mask, + int32_t shrink_axis_mask, MutableArrayRef begin, + MutableArrayRef end, MutableArrayRef stride) { + assert(input_shape.size() <= 32); // Only 32-bit masks are supported. + + // Make sure ranges' ranks are consistent with the input. 
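+  // For example (hypothetical values), for a dimension of size 10 with
+  // stride 1, begin -3 and the end mask set: begin is offset to 7 and clamped
+  // to [0, 10], end becomes 10, so the sliced size is 3; if the shrink axis
+  // bit is also set, the dimension is turned into 1 and end is forced to
+  // begin + 1.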
+ assert(input_shape.size() == begin.size()); + assert(input_shape.size() == end.size()); + assert(input_shape.size() == stride.size()); + + for (int i = 0, e = input_shape.size(); i < e; ++i) { + if (ShapedType::isDynamic(input_shape[i])) continue; + + int64_t dim_i = input_shape[i]; + int64_t begin_i = begin[i]; + int64_t end_i = end[i]; + int64_t stride_i = stride[i]; + + // [0]: mask for begin, [1]: mask for end + int64_t masks[] = {begin_mask & (1 << i), end_mask & (1 << i)}; + // [0]: bound for begin, [1]: bound for end + int64_t bounds[] = {stride_i > 0 ? 0 : -1, + stride_i > 0 ? dim_i : dim_i - 1}; + + // Canonicalizes the given range `point` (begin/end) according to the + // current dimension. `c` means case: 0 for begin, 1 for end. + auto canonicalize = [&](int64_t point, int c) { + if (masks[c]) return stride_i > 0 ? bounds[c] : bounds[(c + 1) & 1]; + + // Add dim as offset to negative range point. + point = point < 0 ? dim_i + point : point; + return Clamp(point, bounds[0], bounds[1]); + }; + + begin_i = canonicalize(begin_i, 0); + end_i = canonicalize(end_i, 1); + + int64_t interval_len = end_i - begin_i; + int64_t size_i = 0; + // If internal length is zero or has different sign from stride, it's a + // degenerated case: we are slicing nothing. Otherwise, calculate the sliced + // size. + if (interval_len != 0 && (interval_len < 0) == (stride_i < 0)) + size_i = (interval_len / stride_i) + (interval_len % stride_i != 0); + + begin[i] = begin_i; + if (IsSet(shrink_axis_mask, i)) { + // Shrink this dimension. It means we only take the element at begin_i. + input_shape[i] = 1; + end[i] = begin_i + 1; + stride[i] = 1; + } else { + input_shape[i] = size_i; + end[i] = end_i; + stride[i] = stride_i; + } + } +} + +// For the given `input_shape`, calculates the sliced shape using the given +// `sparse_begin`, `sparse_end`, and `sparse_strides` ranges and `begin_mask`, +// `end_mask`, `ellipsis_mask` , `new_axis_mask` and `shrink_axis_mask` masks. +// Updates the result back to `input_shape`. +static void CalculateSlicedShapeFromSparseIndices( + MutableArrayRef input_shape, ArrayRef sparse_begin, + ArrayRef sparse_end, ArrayRef sparse_strides, + int32_t begin_mask, int32_t end_mask, int32_t ellipsis_mask, + int32_t new_axis_mask, int32_t shrink_axis_mask, + SmallVectorImpl *begin, SmallVectorImpl *end, + SmallVectorImpl *stride) { + int64_t num_sparse_indices = sparse_begin.size(); + SparseSliceSpec sparse = {num_sparse_indices, begin_mask, end_mask, + ellipsis_mask, new_axis_mask, shrink_axis_mask, + sparse_begin, sparse_end, sparse_strides}; + + // If no ellipsis_mask exists then an implicit ellipsis_mask at the end is + // inserted. This handles cases where foo[2:4] (foo.shape() = [4, 8]) yields + // a tensor of shape [2, 8], i.e., foo[2:4] is same as foo[2:4, ...]. 
+ if (sparse.ellipsis_mask == 0) { + Set(sparse.ellipsis_mask, sparse.dims); + sparse.dims++; + } + + int64_t dims = input_shape.size(); + DenseSliceSpec dense = {dims, + /*begin_mask = */ 0, + /*end_mask = */ 0, + /*shrink_axis_mask = */ 0, + *begin, + *end, + *stride}; + + BuildDenseSliceSpec(sparse, &dense); + CalculateSlicedShapeFromDenseIndices(input_shape, dense.begin_mask, + dense.end_mask, dense.shrink_axis_mask, + *begin, *end, *stride); +} + +bool StridedSliceOp::GetSlicedBoundRanges( + SmallVectorImpl *slice_begin, SmallVectorImpl *slice_end, + SmallVectorImpl *slice_stride) { + // TODO(hinsu): Support lowering for ops with dynamic begin and end values + // when it is possible to derive indices based on mask attributes. + DenseIntElementsAttr sparse_begin_attr, sparse_end_attr, sparse_strides_attr; + if (!matchPattern(begin(), m_Constant(&sparse_begin_attr)) || + !matchPattern(end(), m_Constant(&sparse_end_attr)) || + !matchPattern(strides(), m_Constant(&sparse_strides_attr))) + return false; + + auto input_ty = this->input().getType().dyn_cast(); + if (!input_ty || !input_ty.hasStaticShape()) return false; + auto input_shape = llvm::to_vector<4>(input_ty.getShape()); + + SmallVector sparse_begin, sparse_end, sparse_strides; + + for (const APInt &index : sparse_begin_attr) + sparse_begin.push_back(index.getSExtValue()); + for (const APInt &index : sparse_end_attr) + sparse_end.push_back(index.getSExtValue()); + for (const APInt &stride : sparse_strides_attr) + sparse_strides.push_back(stride.getSExtValue()); + + CalculateSlicedShapeFromSparseIndices( + input_shape, sparse_begin, sparse_end, sparse_strides, + begin_mask().getZExtValue(), end_mask().getZExtValue(), + ellipsis_mask().getZExtValue(), new_axis_mask().getZExtValue(), + shrink_axis_mask().getZExtValue(), slice_begin, slice_end, slice_stride); + return true; +} + +//===----------------------------------------------------------------------===// +// StridedSliceGradOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(StridedSliceGradOp op) { + auto shape_type = op.shape().getType().dyn_cast(); + if (shape_type && shape_type.getRank() != 1) + return op.emitOpError("'shape' operand must be 1D tensor, but got ") + << shape_type.getRank() << "D tensor"; + + if (failed(VerifyStridedSliceBase(op))) return failure(); + + // TODO(antiagainst): verify the gradient op.dy()'s shape is consistent with + // the sliced type from StridedSlice. 
+ + return success(); +} + +bool StridedSliceGradOp::GetSlicedShapeAndBoundRanges( + SmallVectorImpl *input_shape, + SmallVectorImpl *slice_begin, SmallVectorImpl *slice_end, + SmallVectorImpl *slice_stride) { + DenseIntElementsAttr shape_attr; + DenseIntElementsAttr sparse_begin_attr, sparse_end_attr, sparse_strides_attr; + if (!matchPattern(shape(), m_Constant(&shape_attr)) || + !matchPattern(begin(), m_Constant(&sparse_begin_attr)) || + !matchPattern(end(), m_Constant(&sparse_end_attr)) || + !matchPattern(strides(), m_Constant(&sparse_strides_attr))) + return false; + + int rank = std::distance(shape_attr.begin(), shape_attr.end()); + + input_shape->clear(); + input_shape->reserve(rank); + for (const APInt &dim : shape_attr) + input_shape->push_back(dim.getSExtValue()); + + SmallVector sparse_begin, sparse_end, sparse_strides; + + for (const APInt &index : sparse_begin_attr) + sparse_begin.push_back(index.getSExtValue()); + for (const APInt &index : sparse_end_attr) + sparse_end.push_back(index.getSExtValue()); + for (const APInt &stride : sparse_strides_attr) + sparse_strides.push_back(stride.getSExtValue()); + + CalculateSlicedShapeFromSparseIndices( + *input_shape, sparse_begin, sparse_end, sparse_strides, + begin_mask().getZExtValue(), end_mask().getZExtValue(), + ellipsis_mask().getZExtValue(), new_axis_mask().getZExtValue(), + shrink_axis_mask().getZExtValue(), slice_begin, slice_end, slice_stride); + return true; +} + +//===----------------------------------------------------------------------===// +// TensorListReserveOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(TensorListReserveOp op) { + if (!IsOfRankOrUnranked(op.element_shape(), 0) && + !IsOfRankOrUnranked(op.element_shape(), 1)) { + return op.emitOpError("requires element_shape operand to be 0D/1D tensor"); + } + + if (!IsOfRankOrUnranked(op.num_elements(), 0)) { + return op.emitOpError("requires num_elements operand to be 0D tensor"); + } + return success(); +} + +//===----------------------------------------------------------------------===// +// TensorListElementShapeOp +//===----------------------------------------------------------------------===// + +OpFoldResult TensorListElementShapeOp::fold(ArrayRef operands) { + int width = + getType().cast().getElementType().getIntOrFloatBitWidth(); + auto variant_type = + getElementTypeOrSelf(getOperand().getType()).cast(); + if (variant_type.getSubtypes().empty()) return {}; + return ConvertShapeToAttr(variant_type.getSubtypes()[0], width); +} + +//===----------------------------------------------------------------------===// +// TensorListStackOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(TensorListStackOp op) { + if (!IsOfRankOrUnranked(op.element_shape(), 0) && + !IsOfRankOrUnranked(op.element_shape(), 1)) { + return op.emitOpError("requires element_shape operand to be 0D/1D tensor"); + } + return success(); +} + +//===----------------------------------------------------------------------===// +// TensorScatterUpdateOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(TensorScatterUpdateOp op) { + if (!HasRankAtLeast(op.tensor(), 1)) + return op.emitOpError( + "requires tensor operand to have at least 1 dimension"); + if (!HasRankAtLeast(op.indices(), 1)) + return op.emitOpError( + "requires indices operand to have at least 1 dimension"); + if (!HasRankAtLeast(op.updates(), 
1)) + return op.emitOpError( + "requires updates operand to have at least 1 dimension"); + + auto tensor_ty = op.tensor().getType().dyn_cast(); + auto indices_ty = op.indices().getType().dyn_cast(); + if (!tensor_ty || !indices_ty) return success(); + + int64_t num_index_dims = indices_ty.getShape().back(); + if (ShapedType::isDynamic(num_index_dims)) return success(); + + if (num_index_dims > tensor_ty.getRank()) + return op.emitOpError( + "requires tensor operand with rank greater than or equal to the " + "indices operand's last dimensions"); + return success(); +} + +//===----------------------------------------------------------------------===// +// TopKV2Op +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(TopKV2Op op) { + if (!HasRankAtLeast(op.input(), 1)) + return op.emitOpError( + "requires input operand to have at least 1 dimension"); + + if (!IsOfRankOrUnranked(op.k(), 0)) + return op.emitOpError("requires k operand to be 0D tensor"); + + return success(); +} + +//===----------------------------------------------------------------------===// +// ToBoolOp +//===----------------------------------------------------------------------===// + +namespace { +// If the input to ToBoolOp is a `tensor`, then the ToBoolOp is an identity +// function and can be removed. +class ToBoolOfZeroDBoolTensor : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(ToBoolOp op, + PatternRewriter &rewriter) const override { + if (auto type = op.getOperand().getType().dyn_cast()) { + if (type.getRank() == 0 && type.getElementType().isInteger(1)) { + rewriter.replaceOp(op, op.getOperand()); + return success(); + } + } + return failure(); + } +}; +} // namespace + +void ToBoolOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} + +//===----------------------------------------------------------------------===// +// TransposeOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(TransposeOp op) { + auto perm_type = op.perm().getType().dyn_cast(); + auto x_type = op.x().getType().dyn_cast(); + auto y_type = op.y().getType().dyn_cast(); + + if (perm_type && perm_type.getRank() != 1) { + return op.emitOpError() + << "expected perm to be a 1-D Tensor, got perm of rank " + << perm_type.getRank(); + } + + if (x_type && y_type && x_type.getRank() != y_type.getRank()) { + return op.emitOpError() << "x should be of the same rank with y, got " + << "x of rank " << x_type.getRank() + << ", and y of rank " << y_type.getRank(); + } + + if (!x_type || !y_type || !perm_type || !perm_type.hasStaticShape()) { + return success(); + } + + if (x_type.getRank() != perm_type.getNumElements()) { + return op.emitOpError() << "expected perm to be a 1-D Tensor of size " + << "equal to the rank of x, got perm of size " + << perm_type.getNumElements() << ", and x of rank " + << x_type.getRank(); + } + + DenseIntElementsAttr attr_perm; + if (matchPattern(op.perm(), m_Constant(&attr_perm))) { + // y.shape[i] should be equal to x.shape[perm[i]] + // for i = [0, 1, ..., rank(x) - 1] + for (auto e : llvm::enumerate(attr_perm)) { + const int64_t y_idx = e.index(); + const int64_t y_dim = y_type.getDimSize(y_idx); + const int64_t x_idx = e.value().getSExtValue(); + const int64_t x_dim = x_type.getDimSize(x_idx); + if (y_dim != ShapedType::kDynamicSize && + x_dim != ShapedType::kDynamicSize && y_dim != 
x_dim) { + return op.emitOpError() + << "requires y.shape[" << y_idx << "] (" << y_dim << ") " + << "to be equal to x.shape[perm[" << x_idx << "]] " + << "(" << x_dim << ")"; + } + } + } + + return success(); +} + +// TODO(jpienaar): perm could be optional too. +void TransposeOp::build(OpBuilder &builder, OperationState &result, Value x, + Value perm) { + auto x_type = x.getType().cast(); + // If value is unranked, then so is results. + if (!x_type.hasRank()) + return TransposeOp::build(builder, result, + UnrankedTensorType::get(x_type.getElementType()), + x, perm); + + // TODO(jpienaar): Handle unknown perm case. + + // TODO(jpienaar): Extract utility function. + auto etype = x_type.cast().getElementType(); + DenseIntElementsAttr attr_shape; + if (matchPattern(perm, m_Constant(&attr_shape))) { + llvm::SmallVector const_shape; + if (attr_shape.isSplat()) { + const_shape.assign( + attr_shape.getNumElements(), + x_type.getDimSize((*attr_shape.begin()).getSExtValue())); + } else { + const_shape.reserve(attr_shape.getNumElements()); + for (const auto &dim : attr_shape) + const_shape.push_back(x_type.getDimSize(dim.getSExtValue())); + } + return TransposeOp::build( + builder, result, RankedTensorType::get(const_shape, etype), x, perm); + } + return TransposeOp::build(builder, result, UnrankedTensorType::get(etype), x, + perm); +} + +namespace { + +OpFoldResult FoldIdentityTranspose(TransposeOp op) { + auto const_perm = dyn_cast_or_null(op.perm().getDefiningOp()); + if (!const_perm) return {}; + + auto const_value = const_perm.value(); + const auto elements = const_value.getValues(); + + for (auto it : llvm::enumerate(elements)) { + if (it.index() != it.value()) return {}; + } + + // TODO(jpienaar): Remove if/when we handle this more generally. + if (op.getType() != op.x().getType()) { + // If the types don't match then only fold if all the operands are in the TF + // dialect. + for (auto user : op.getOperation()->getUsers()) + if (user->getDialect() != op.getDialect()) return {}; + } + + return op.x(); +} + +OpFoldResult FoldCancellableTranspose(TransposeOp op) { + // Operand is a TransposeOp. + auto transpose = dyn_cast_or_null(op.x().getDefiningOp()); + if (!transpose) return {}; + + // Permutations defined by constant operations. 
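+  // For example, tf.Transpose(tf.Transpose(x, [1, 0, 2]), [1, 0, 2]) folds to
+  // x, since applying the permutation [1, 0, 2] twice is the identity; more
+  // generally the two constant permutations cancel when they are inverses of
+  // each other.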
+ auto perm0 = dyn_cast_or_null(op.perm().getDefiningOp()); + auto perm1 = dyn_cast_or_null(transpose.perm().getDefiningOp()); + if (!perm0 || !perm1) return {}; + + // With permutation indices that cancel each other + auto perm0_value = perm0.value().cast(); + auto perm1_value = perm1.value().cast(); + if (!AreCancellablePermutations(perm0_value, perm1_value)) return {}; + + return transpose.x(); +} + +} // namespace + +OpFoldResult TransposeOp::fold(ArrayRef operands) { + if (auto folded = FoldIdentityTranspose(*this)) return folded; + if (auto folded = FoldCancellableTranspose(*this)) return folded; + return {}; +} + +//===----------------------------------------------------------------------===// +// TruncateDivOp +//===----------------------------------------------------------------------===// + +void TruncateDivOp::getCanonicalizationPatterns( + OwningRewritePatternList &results, MLIRContext *context) { + results.insert(context); +} + +//===----------------------------------------------------------------------===// +// UnpackOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(UnpackOp op) { + auto value_type = op.value().getType().dyn_cast(); + if (!value_type) return success(); + + int64_t value_rank = value_type.getRank(); + int64_t axis = op.axis().getSExtValue(); + if (axis < -value_rank || axis >= value_rank) + return op.emitOpError("axis attribute must be in the range of [-") + << value_rank << ", " << value_rank << ')'; + + axis = GetDimForAxis(axis, value_rank); + int64_t dim_size = value_type.getDimSize(axis); + if (ShapedType::isDynamic(dim_size)) return success(); + + if (dim_size != op.getNumResults()) + return op.emitOpError("result count must be equal to ") << dim_size; + + return success(); +} + +//===----------------------------------------------------------------------===// +// Unsorted segment reduction ops +//===----------------------------------------------------------------------===// + +template +static LogicalResult VerifyUnsortedSegmentReduction(Op op) { + if (!HasRankAtMost(op.num_segments(), 0)) + return op.emitOpError("number of segments should be a 0-D tensor"); + + auto data_type = op.data().getType().template dyn_cast(); + auto segment_ids_type = + op.segment_ids().getType().template dyn_cast(); + if (data_type && segment_ids_type) { + if (data_type.getRank() < segment_ids_type.getRank()) + return op.emitOpError( + "requires segment ids rank to be less than or equal to data's rank"); + + int index = 0; + for (auto shape_pair : + llvm::zip_first(segment_ids_type.getShape(), data_type.getShape())) { + int64_t segment_id_dim = std::get<0>(shape_pair); + int64_t data_dim = std::get<1>(shape_pair); + if (!ShapedType::isDynamic(segment_id_dim) && + !ShapedType::isDynamic(data_dim) && segment_id_dim != data_dim) + return op.emitOpError( + "requires segment ids shape to be a prefix of data shape, " + "but dimension #") + << index << " differs: " << segment_id_dim << " vs. 
" + << data_dim; + ++index; + } + } + + DenseIntElementsAttr num_segments_attr; + if (matchPattern(op.num_segments(), m_Constant(&num_segments_attr))) { + int64_t num_segments = (*num_segments_attr.begin()).getSExtValue(); + if (num_segments < 0) + return op.emitOpError("num of segments cannot be negative"); + } + + return success(); +} + +//===----------------------------------------------------------------------===// +// VarIsInitializedOp +//===----------------------------------------------------------------------===// + +namespace { + +/// Erase VarIsInitializedOp operations with no uses. This op has side effect on +/// resources (read-only), but can still be deleted if it has zero uses. +struct EraseDeadVarIsInitializedOp + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(VarIsInitializedOp op, + PatternRewriter &rewriter) const override { + if (!op.use_empty()) return failure(); + rewriter.eraseOp(op); + return success(); + } +}; +} // end anonymous namespace. + +void VarIsInitializedOp::getCanonicalizationPatterns( + OwningRewritePatternList &patterns, MLIRContext *context) { + patterns.insert(context); +} + +//===----------------------------------------------------------------------===// +// VariableShapeOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(VariableShapeOp op) { + auto input_type = op.input().getType().cast(); + if (input_type.hasStaticShape() && input_type.getNumElements() != 1) + return op.emitOpError("requires input to have one resource"); + + auto resource_type = input_type.getElementType().cast(); + auto subtypes = resource_type.getSubtypes(); + switch (subtypes.size()) { + case 1: + return VerifyShapeOperandAndResult( + op, resource_type.getSubtypes().front(), op.getType()); + case 0: + return VerifyShapeOperandAndResult(op, Type(), op.getType()); + default: + return op.emitOpError( + "requires resource input type to have at most 1 subtype"); + } +} + +OpFoldResult VariableShapeOp::fold(ArrayRef operands) { + int width = + getType().cast().getElementType().getIntOrFloatBitWidth(); + auto resource_type = + getElementTypeOrSelf(getOperand().getType()).cast(); + if (resource_type.getSubtypes().empty()) return {}; + return ConvertShapeToAttr(resource_type.getSubtypes()[0], width); +} + +//===----------------------------------------------------------------------===// +// WhileOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(WhileOp op) { + auto cond_fn = op.cond_func(); + auto body_fn = op.body_func(); + if (!cond_fn) { + return op.emitOpError("cond refers to an undefined function : ") + << op.cond(); + } + if (!body_fn) { + return op.emitOpError("body refers to an undefined function : ") + << op.body(); + } + + auto cond_fn_type = cond_fn.getType(); + auto body_fn_type = body_fn.getType(); + + // Verify that the cond function has exactly one result. + if (cond_fn_type.getNumResults() != 1) + return op.emitOpError("requires cond function to have exactly one result"); + + SmallVector operands(op.getOperandTypes()); + + // Collect all the type lists for the op so that different pairs of type lists + // can be compared for the compatibility. 
+ constexpr int kNumTypeLists = 5; + const std::array>, kNumTypeLists> + type_lists = {{ + {"operand", operands}, + {"body function result", body_fn_type.getResults()}, + {"result", op.getResultTypes()}, + {"cond function input", cond_fn_type.getInputs()}, + {"body function input", body_fn_type.getInputs()}, + }}; + + // A pair of type lists should be cast compatible with each other if one is + // converted to the another for a function call or assignment or there is a + // common source of inputs for both. Therefore, the While op requires the + // following pairs of type lists to be cast compatible for the tensor_cast + // operation: + // + // * Operands and cond inputs to call the cond function before the + // first iteration. + // * Operands and body inputs to call the body function for the first + // iteration if the cond functions returns True or equivalent result. + // * Operands and results to assign cond function arguments to op results if + // the cond function returns False or equivalent result. + // * All three pairs using cond inputs, body inputs and results as operand is + // a common source for all three. + // * Body result and cond inputs to call the cond function for the subsequent + // iterations. Similarly, Body result should be compatible with body inputs + // and op results. + // + // Note that the operands and body results need not be compatible as they are + // never converted from one to the another nor there is a common source + // tensors. Compatibility requirement is not transitive. + + for (int i = 0; i < kNumTypeLists; ++i) { + // Skip the first pair as the While op operands and body function results + // does not need to be compatible with each other. + for (int j = std::max(2, i + 1); j < kNumTypeLists; ++j) { + auto &a = type_lists[i]; + auto &b = type_lists[j]; + + int a_size = a.second.size(); + if (a_size != b.second.size()) + return op.emitOpError( + llvm::formatv("requires the number of {0}s to be equal to the " + "number of {1}s. Found {2} and {3}, respectively", + a.first, b.first, a_size, b.second.size())); + + for (int idx = 0; idx < a_size; ++idx) { + auto a_type = a.second[idx]; + auto b_type = b.second[idx]; + + if (!AreCastCompatible({a_type, b_type})) + return op.emitError(llvm::formatv( + "{0} type {1} is incompatible with {2} type {3} at index {4}", + a.first, a_type, b.first, b_type, idx)); + } + } + } + return success(); +} + +//===----------------------------------------------------------------------===// +// WhileOp canonicalization. +//===----------------------------------------------------------------------===// +void WhileOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert>(context); +} + +//===----------------------------------------------------------------------===// +// WhileRegionOp +//===----------------------------------------------------------------------===// +static LogicalResult Verify(WhileRegionOp op) { + // Verify that the condition generates a single tensor result. + YieldOp yield = cast(op.cond().front().getTerminator()); + if (yield.getNumOperands() != 1) + return op.emitOpError() + << "condition should have a single tensor result"; + + auto cond_type = yield.getOperand(0).getType().dyn_cast(); + if (!cond_type || !cond_type.getShape().equals({}) || + !cond_type.getElementType().isInteger(/*width=*/1)) + return op.emitOpError() + << "condition should have a single tensor result"; + + // The body result types should match while op result types. 
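+  // For example (hypothetical types), a WhileRegion with operands
+  // (tensor<i32>, tensor<?xf32>) requires cond and body blocks that each take
+  // two arguments cast compatible with those types, with the cond region
+  // yielding a single scalar i1 tensor as checked above.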
+ if (failed(VerifyRegionResults(op, op.body(), "body"))) return failure(); + + // Both condition and body should have same number and type of operands as + // the WhileRegion inputs. + const int num_inputs = op.getNumOperands(); + auto block_inputs_match_op_inputs = [&](Region ®ion, + StringRef name) -> LogicalResult { + Block &block = region.front(); + if (block.getNumArguments() != num_inputs) + return op.emitOpError() + << name << " should have same number of inputs (" << num_inputs + << ") as " << WhileRegionOp::getOperationName() << " but has " + << block.getNumArguments() << " inputs"; + + for (auto types_idx : llvm::enumerate( + llvm::zip(op.getOperandTypes(), block.getArgumentTypes()))) { + auto op_input_type = std::get<0>(types_idx.value()); + auto block_input_type = std::get<1>(types_idx.value()); + if (!AreCastCompatible({block_input_type, op_input_type})) + return op.emitOpError(llvm::formatv( + "{0} input type {1} is incompatible with {2} " + "input type {3} at index {4}", + name, block_input_type, WhileRegionOp::getOperationName(), + op_input_type, types_idx.index())); + } + return success(); + }; + + if (failed(block_inputs_match_op_inputs(op.cond(), "condition")) || + failed(block_inputs_match_op_inputs(op.body(), "body"))) + return failure(); + + return success(); +} + +//===----------------------------------------------------------------------===// +// WhileRegionOp LoopLikeOpInterface +//===----------------------------------------------------------------------===// + +Region &WhileRegionOp::getLoopBody() { return body(); } + +bool WhileRegionOp::isDefinedOutsideOfLoop(Value value) { + // If the Op defining the value exists and the defining op is outside the + // scope of this WhileRegion, then we can infer that its defined outside. + // The defining Op is outside the scope of this WhileRegion if this + // WhileRegionOp is not an ancestor of the defining op in the parent chain. + Operation *def_op = value.getDefiningOp(); + return def_op && !getOperation()->isAncestor(def_op); +} + +LogicalResult WhileRegionOp::moveOutOfLoop( + llvm::ArrayRef ops) { + // Move the hoisted value to just before the while. + Operation *while_op = this->getOperation(); + for (auto op : ops) op->moveBefore(while_op); + return success(); +} + +//===----------------------------------------------------------------------===// +// WhileRegionOp canonicalization +//===----------------------------------------------------------------------===// +namespace { +// Eliminate values that pass through the WhileRegionOp body. +struct WhileRegionEliminatePassThrough + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(WhileRegionOp while_op, + PatternRewriter &rewriter) const override { + // Replace values that simply passthrough the body with extern values. The + // block arguments of body and while match and so the corresponding cond + // argument can be easily found. + int old_num_operands = while_op.getNumOperands(); + int new_num_operands = old_num_operands; + auto &body_block = while_op.body().front(); + auto &cond_block = while_op.cond().front(); + auto &yield = *body_block.getTerminator(); + + // Bit mask indicating which operands will be removed. 
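+    // For example, if the body simply yields its block argument #1 and, after
+    // the rewrites below, neither region's argument #1 nor result #1 has any
+    // remaining use, operand #1 is dropped and the op is rebuilt with one
+    // fewer operand and result.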
+ SmallVector removed_operand(old_num_operands, false); + + for (int op_idx : llvm::seq(0, old_num_operands)) { + auto body_arg = body_block.getArgument(op_idx); + if (body_arg == yield.getOperand(op_idx)) { + // Replace the use of the passthrough value with the while operand + // in the body and condition regions, as well as the while output (if + // type match) + // TODO(jurahul): Use PatternRewriter API for IR modification. + auto value = while_op.getOperand(op_idx); + if (body_arg.getType() == value.getType()) + body_arg.replaceAllUsesWith(value); + + auto cond_arg = cond_block.getArgument(op_idx); + if (cond_arg.getType() == value.getType()) + cond_arg.replaceAllUsesWith(value); + + auto result = while_op.getResult(op_idx); + if (result.getType() == value.getType()) + result.replaceAllUsesWith(value); + } + + // Now check if the operand is unused in both regions as well as the + // result. If so, mark it for removal. + if (body_block.getArgument(op_idx).use_empty() && + cond_block.getArgument(op_idx).use_empty() && + while_op.getResult(op_idx).use_empty()) { + removed_operand[op_idx] = true; + new_num_operands--; + } + } + + if (new_num_operands == old_num_operands) return failure(); + + // Compress the operands, region arguments, and outputs. + SmallVector new_while_operands; + SmallVector new_result_types; + new_while_operands.reserve(new_num_operands); + new_result_types.reserve(new_num_operands); + + // Build new operands and result type. + int next_idx = 0; + for (int op_idx : llvm::seq(0, old_num_operands)) { + if (removed_operand[op_idx]) continue; + new_while_operands.push_back(while_op.getOperand(op_idx)); + new_result_types.push_back(while_op.getResult(op_idx).getType()); + next_idx++; + } + + // Create the new while operation. + auto new_while_op = + rewriter.create(while_op.getLoc(), new_result_types, + new_while_operands, while_op.getAttrs()); + + // Move region bodies to the new while. + rewriter.inlineRegionBefore(while_op.cond(), new_while_op.cond(), + new_while_op.cond().end()); + rewriter.inlineRegionBefore(while_op.body(), new_while_op.body(), + new_while_op.body().end()); + + auto &new_cond_block = new_while_op.cond().front(); + auto &new_body_block = new_while_op.body().front(); + auto &new_yield = *new_body_block.getTerminator(); + + // Build a vector of new results. Also patch up the region bodies and yield. 
+    SmallVector<Value, 4> new_results;
+    next_idx = 0;
+    for (int op_idx : llvm::seq<int>(0, old_num_operands)) {
+      if (removed_operand[op_idx]) {
+        new_cond_block.eraseArgument(next_idx);
+        new_body_block.eraseArgument(next_idx);
+        new_yield.eraseOperand(next_idx);
+        new_results.push_back(nullptr);
+      } else {
+        new_results.push_back(new_while_op.getResult(next_idx++));
+      }
+    }
+
+    rewriter.replaceOp(while_op, new_results);
+    return success();
+  }
+};
+
+}  // anonymous namespace
+
+void WhileRegionOp::getCanonicalizationPatterns(
+    OwningRewritePatternList &results, MLIRContext *context) {
+  results.insert<WhileRegionEliminatePassThrough>(context);
+}
+
+//===----------------------------------------------------------------------===//
+// XdivyOp
+//===----------------------------------------------------------------------===//
+
+void XdivyOp::getCanonicalizationPatterns(OwningRewritePatternList &results,
+                                          MLIRContext *context) {
+  results.insert<XdivyWithSqrtDivisor>(context);
+}
+
+//===----------------------------------------------------------------------===//
+// TableGen'd op method definitions
+//===----------------------------------------------------------------------===//
+
+#define GET_OP_CLASSES
+#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc.inc"
+
+}  // namespace TF
+}  // namespace mlir
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.h
new file mode 100644
index 00000000000..761c06a475c
--- /dev/null
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.h
@@ -0,0 +1,52 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OPS_N_Z_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OPS_N_Z_H_ + +#include "mlir/Dialect/Traits.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/IR/OpImplementation.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project +#include "mlir/Interfaces/DerivedAttributeOpInterface.h" // from @llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project +#include "mlir/Interfaces/LoopLikeInterface.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.h" + +namespace mlir { +namespace TF { + +#define GET_OP_FWD_DEFINES +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_all_ops.h.inc" +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.h.inc" + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OPS_N_Z_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.cc new file mode 100644 index 00000000000..e87cc494a4a --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.cc @@ -0,0 +1,87 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Sequence.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/FormatVariadic.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/Dialect/Traits.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/DialectImplementation.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/Identifier.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/OpImplementation.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Parser.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/InliningUtils.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/util/tensor_format.h" + +namespace mlir { +namespace TF { + +namespace { +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_helpers.inc" +#include "tensorflow/compiler/mlir/tensorflow/transforms/generated_canonicalize.inc" +} // namespace + +//===----------------------------------------------------------------------===// +// TableGen'd op method definitions +//===----------------------------------------------------------------------===// + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.cc.inc" + +} // namespace TF +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.h new file mode 100644 index 00000000000..8586515edee --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.h @@ -0,0 +1,50 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_REMAINING_OPS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_REMAINING_OPS_H_ + +#include "mlir/Dialect/Traits.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/IR/OpImplementation.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project +#include "mlir/Interfaces/DerivedAttributeOpInterface.h" // from @llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project +#include "mlir/Interfaces/LoopLikeInterface.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.h" + +namespace mlir { +namespace TF { + +#define GET_OP_FWD_DEFINES +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_all_ops.h.inc" +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.h.inc" + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_REMAINING_OPS_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc index edfc7feefd5..6883d0358ec 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc @@ -113,7 +113,8 @@ static LogicalResult Verify(SessionInitializerOp session_initializer) { //===----------------------------------------------------------------------===// TensorFlowSavedModelDialect::TensorFlowSavedModelDialect(MLIRContext *context) - : Dialect(/*name=*/"tf_saved_model", context) { + : Dialect(/*name=*/"tf_saved_model", context, + TypeID::get()) { addOperations< #define GET_OP_LIST #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc.inc" @@ -337,6 +338,7 @@ LogicalResult VerifyExportedFunc(FuncOp func) { if (auto attr = func.getArgAttrOfType( i, "tf_saved_model.bound_input")) { if (!unique_bound_inputs.insert(attr.getValue()).second) { + if (module.getAttr("tf_saved_model.under_construction")) continue; return func.emitError() << "duplicate 'tf_saved_model.bound_input' binding"; } diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h index f488171d1e1..fc8e6f40f65 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h +++ 
b/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h @@ -21,6 +21,7 @@ limitations under the License. #include "mlir/IR/OpDefinition.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" @@ -65,11 +66,73 @@ class OperandsSameAsResultsTypeOrRef } }; +// Verifies that op has the same operand and result element types (or type +// itself, if scalar) after resolving reference types (i.e., after converting +// reference types to their corresponding TensorFlow or standard types). +template +class SameOperandsAndResultElementTypeResolveRef + : public TraitBase { + public: + static LogicalResult verifyTrait(Operation* op) { + Type element_type; + if (op->getNumResults() > 0) { + element_type = + mlir::TF::GetElementTypeOrSelfResolveRef(op->getResult(0).getType()); + } else if (op->getNumOperands() > 0) { + element_type = + mlir::TF::GetElementTypeOrSelfResolveRef(op->getOperand(0).getType()); + } else { + // Nothing to check. + return success(); + } + // Verify that all result element types are compatible to `element_type`. + for (const auto& result_type : op->getResultTypes()) { + if (mlir::TF::GetElementTypeOrSelfResolveRef(result_type) != + element_type) { + return op->emitOpError( + "requires compatible element types for all operands and results"); + } + } + // Verify that all operand element types are compatible to `element_type`. + for (const auto& operand_type : op->getOperandTypes()) { + if (mlir::TF::GetElementTypeOrSelfResolveRef(operand_type) != + element_type) { + return op->emitOpError( + "requires compatible element types for all operands and results"); + } + } + return success(); + } +}; + // Layout agnostic operations do not depend on the operands data layout (data // format), as and example all element wise operations are layout agnostic. template class LayoutAgnostic : public TraitBase {}; +// Trait to indicate operations that cannot be duplicated as they might carry +// certain state around within their implementations. +template +class CannotDuplicate : public TraitBase { + public: + static LogicalResult verifyTrait(Operation* op) { + if (MemoryEffectOpInterface::hasNoEffect(op)) + return op->emitError( + "operations with no side effects cannot have CannotDuplicate trait"); + return success(); + } +}; + +// Coefficient-wise binary operation with implicit broadcasting support, for +// example tf.Sub operation. +template +class CwiseBinary : public TraitBase {}; + +// Coefficient-wise unary operation, for example tf.Sqrt operation. +template +class CwiseUnary : public TraitBase {}; + } // namespace TF } // namespace OpTrait } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h index f352bc0eb47..125f6bb31df 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h @@ -22,6 +22,7 @@ limitations under the License. 
#include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project namespace mlir { @@ -166,6 +167,17 @@ static inline Type GetDefaultTypeOf(TensorFlowRefType type) { return type.RemoveRef(); } +// Returns the element type if `type` is a `ShapedType` and the type itself +// otherwise, converting `TensorFlowRef` type to corresponding `TensorFlow` or +// standard type if necessary. +static inline Type GetElementTypeOrSelfResolveRef(Type type) { + Type element_type = mlir::getElementTypeOrSelf(type); + if (auto ref_type = element_type.dyn_cast()) { + element_type = ref_type.RemoveRef(); + } + return element_type; +} + #define HANDLE_TF_TYPE(tftype, enumerant, name) \ class tftype##Type : public detail::TensorFlowTypeImpl { \ public: \ diff --git a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir index 8597740a4ae..595bdce5be4 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir @@ -143,6 +143,56 @@ func @testConcatCanonicalization(%arg0: tensor<2x1xi32>, %arg1: tensor<2x1xi32>) return %1 : tensor<2x2xi32> } +// CHECK-LABEL: testConcatCwiseUnary +func @testConcatCwiseUnary(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { + + // CHECK: %[[CONCAT:.*]] = "tf.ConcatV2"(%arg0, %arg1, %arg2) + // CHECK: %[[LOG1P:.*]] = "tf.Log1p"(%[[CONCAT]]) + // CHECK: return %[[LOG1P]] + %0 = "tf.Log1p"(%arg0) : (tensor) -> tensor + %1 = "tf.Log1p"(%arg1) : (tensor) -> tensor + %2 = "tf.ConcatV2"(%0, %1, %arg2) : (tensor, tensor, tensor) -> tensor + + return %2 : tensor +} + +// CHECK-LABEL: testConcatCwiseBinaryOnInnerDim +func @testConcatCwiseBinaryOnInnerDim(%arg0: tensor, + %arg1: tensor, %arg2: tensor, %arg3: tensor) -> tensor { + + // CHECK: %[[LHS_AXIS:.*]] = "tf.Const"() {value = dense<1> : tensor} + // CHECK: %[[RHS_AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} + + // CHECK: %[[LHS_CONCAT:.*]] = "tf.ConcatV2"(%arg0, %arg1, %[[LHS_AXIS]]) + // CHECK: %[[RHS_CONCAT:.*]] = "tf.ConcatV2"(%arg2, %arg3, %[[RHS_AXIS]]) + + // CHECK: %[[MUL:.*]] = "tf.Mul"(%[[LHS_CONCAT]], %[[RHS_CONCAT]]) + // CHECK-SAME: (tensor, tensor<2xf32>) -> tensor + // CHECK: return %[[MUL]] + + %0 = "tf.Const"() { value = dense<1> : tensor } : () -> tensor + %1 = "tf.Mul"(%arg0, %arg2) : (tensor, tensor) -> tensor + %2 = "tf.Mul"(%arg1, %arg3) : (tensor, tensor) -> tensor + %3 = "tf.ConcatV2"(%1, %2, %0) : (tensor, tensor, tensor) -> tensor + + return %3 : tensor +} + +// CHECK-LABEL: testConcatCwiseBinaryInvalidInnerDim +func @testConcatCwiseBinaryInvalidInnerDim(%arg0: tensor, + %arg1: tensor, %arg2: tensor, %arg3: tensor) -> tensor { + // Each individual binary operation has an implicit broadcast that will be + // lost if we would reorder them with the concat. 
+ + // CHECK: "tf.ConcatV2"(%1, %2, %0) + %0 = "tf.Const"() { value = dense<1> : tensor } : () -> tensor + %1 = "tf.Mul"(%arg0, %arg2) : (tensor, tensor) -> tensor + %2 = "tf.Mul"(%arg1, %arg3) : (tensor, tensor) -> tensor + %3 = "tf.ConcatV2"(%1, %2, %0) : (tensor, tensor, tensor) -> tensor + + return %3 : tensor +} + // CHECK-LABEL: testLogOfSoftmax func @testLogOfSoftmax(%arg0: tensor<8x16xf32>) -> tensor<8x16xf32> { %0 = "tf.Softmax"(%arg0) : (tensor<8x16xf32>) -> tensor<8x16xf32> @@ -377,6 +427,86 @@ func @testRedundantReshape(%arg0: tensor<4x4xi32>) -> tensor<2x8xi32> { // CHECK: return %1 : tensor<2x8xi32> } +// CHECK-LABEL: testReshapeToSelfShape +func @testReshapeToSelfShape(%arg0: tensor) -> tensor { + %0 = "tf.Shape"(%arg0) : (tensor) -> tensor<2xi32> + %1 = "tf.Reshape"(%arg0, %0) : (tensor, tensor<2xi32>) -> tensor + + // CHECK: return %arg0 : tensor + return %1: tensor +} + +// CHECK-LABEL: func @testReshapeNoOp +func @testReshapeNoOp(%arg0: tensor<2x4xf32>, %arg1: tensor<2xi32>) -> tensor<2x4xf32> { + %0 = "tf.Reshape"(%arg0, %arg1) : (tensor<2x4xf32>, tensor<2xi32>) -> tensor<2x4xf32> + + // CHECK: return %arg0 + return %0 : tensor<2x4xf32> +} + +// CHECK-LABEL: func @testPackShapeComputation +func @testPackShapeComputation(%arg0: tensor, %arg1: tensor, %arg2: tensor<*xf32>) -> (tensor<2xi32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>, tensor<*xi32>) { + // Test dimensions sizes. + %d1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %d2 = "tf.Const"() {value = dense<2> : tensor} : () -> tensor + + // Slice bounds. + %0 = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> + %1 = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + %2 = "tf.Const"() {value = dense<2> : tensor<1xi32>} : () -> tensor<1xi32> + + // Fold pack operation if it computes the input tensor shape: + // + // %shape = tf.Shape(%arg) // [? x ...] + // %dim0 = tf.StridedSlice(%shape, 0, 1, 1) // get unknown dim0 value + // %pack = tf.Pack(dim0, ...) { axis = 0 } // [? x ...] + // + // Where `...` are some statically known dimensions. In this case %pack can be + // replace with a %shape. This is a common pattern in models with a dynamic + // batch size. + + // Test Rank 2 + // CHECK: %[[SHAPE0:.*]] = "tf.Shape" + %3 = "tf.Shape"(%arg0) : (tensor) -> tensor<2xi32> + %4 = "tf.StridedSlice"(%3, %0, %1, %1) {shrink_axis_mask = 1 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %5 = "tf.Pack"(%4, %d1) {axis = 0 : i64} : (tensor, tensor) -> tensor<2xi32> + %6 = "tf.Reshape"(%arg0, %5) : (tensor, tensor<2xi32>) -> tensor + + // Test Rank 3. 
+ // CHECK: %[[SHAPE1:.*]] = "tf.Shape" + %7 = "tf.Shape"(%arg1) : (tensor) -> tensor<3xi32> + %8 = "tf.StridedSlice"(%7, %0, %1, %1) {shrink_axis_mask = 1 : i64} : (tensor<3xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %9 = "tf.Pack"(%8, %d1, %d2) {axis = 0 : i64} : (tensor, tensor, tensor) -> tensor<3xi32> + %10 = "tf.Reshape"(%arg1, %9) : (tensor, tensor<3xi32>) -> tensor + + // Packed dimensions have different order from the reshape operand: + // [?, 1, 2] vs [?, 2, 1] + %14 = "tf.StridedSlice"(%7, %0, %1, %1) {shrink_axis_mask = 1 : i64} : (tensor<3xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %15 = "tf.Pack"(%14, %d2, %d1) {axis = 0 : i64} : (tensor, tensor, tensor) -> tensor<3xi32> + // CHECK: %[[PACK0:.*]] = "tf.Pack" + + // StridedSlice takes second dimension from the shape: + // begin = [1], end = [2], stride = [1] + %17 = "tf.StridedSlice"(%7, %1, %2, %1) {shrink_axis_mask = 1 : i64} : (tensor<3xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %18 = "tf.Pack"(%17, %d1, %d2) {axis = 0 : i64} : (tensor, tensor, tensor) -> tensor<3xi32> + // CHECK: %[[PACK1:.*]] = "tf.Pack" + + // Packed dimensions have higher rank than the reshape operand: + // [?, 1] vs [?, 1, 1] + %20 = "tf.StridedSlice"(%3, %0, %1, %1) {shrink_axis_mask = 1 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %21 = "tf.Pack"(%20, %d1, %d1) {axis = 0 : i64} : (tensor, tensor, tensor) -> tensor<3xi32> + // CHECK: %[[PACK2:.*]] = "tf.Pack" + + // Make sure a dynamic ranked shape doesn't crash the "canonicalize" pass + %23 = "tf.Shape"(%arg2) : (tensor<*xf32>) -> tensor<*xi32> + %24 = "tf.StridedSlice"(%23, %0, %1, %1) {shrink_axis_mask = 1 : i64} : (tensor<*xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<*xi32> + %25 = "tf.Pack"(%24, %d1) {axis = 0 : i64} : (tensor<*xi32>, tensor) -> tensor<*xi32> + // CHECK: %[[PACK3:.*]] = "tf.Pack" + + // CHECK: return %[[SHAPE0]], %[[SHAPE1]], %[[PACK0]], %[[PACK1]], %[[PACK2]], %[[PACK3]] + return %5, %9, %15, %18, %21, %25 : tensor<2xi32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>, tensor<*xi32> +} + // CHECK-LABEL: testSelectScalarPred func @testSelectScalarPred(%arg0: tensor, %arg1: tensor<4x2xf16>, %arg2: tensor<4x2xf16>) -> tensor<4x2xf16> { // CHECK-NEXT: "tf.SelectV2"(%arg0, %arg1, %arg2) : (tensor, tensor<4x2xf16>, tensor<4x2xf16>) -> tensor<4x2xf16> @@ -512,6 +642,18 @@ func @testRealDivWithSqrtDivisor(%arg0: tensor<8x16xf32>, %arg1: tensor<8x16xf32 // CHECK: return %1 } +// CHECK-LABEL: testRealDivWithConstDivisor +func @testRealDivWithConstDivisor(%arg0: tensor<8x2xf32>) -> tensor<8x2xf32> { + %0 = "tf.Const"() {value = dense<[2.0, 4.0]> : tensor<2xf32>} : () -> tensor<2xf32> + %1 = "tf.RealDiv"(%arg0, %0) : (tensor<8x2xf32>, tensor<2xf32>) -> tensor<8x2xf32> + return %1: tensor<8x2xf32> + + // CHECK: %0 = "tf.Const" + // CHECK-SAME: value = dense<[5.000000e-01, 2.500000e-01] + // CHECK: %1 = "tf.Mul"(%arg0, %0) + // CHECK: return %1 +} + // CHECK-LABEL: testTruncateDivWithSqrtDivisor func @testTruncateDivWithSqrtDivisor(%arg0: tensor<8x16xf32>, %arg1: tensor<8x16xf32>) -> tensor<8x16xf32> { %0 = "tf.Sqrt"(%arg1) : (tensor<8x16xf32>) -> tensor<8x16xf32> @@ -663,6 +805,27 @@ func @foldFill() -> (tensor<3x2x1xf32>, tensor<*xf32>, tensor<*xcomplex>) { return %2, %3, %4 : tensor<3x2x1xf32>, tensor<*xf32>, tensor<*xcomplex> } +// CHECK-LABEL: foldIf +func @foldIf(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> (tensor) { + %0 = "tf.Const"() {value = dense : 
tensor} : () -> tensor + %1 = "tf.Const"() {value = dense : tensor} : () -> tensor + + // CHECK: %0 = "tf.PartitionedCall"(%arg0, %arg1) + // CHECK-SAME: device = "noodle" + // CHECK-SAME: f = @sub + %2 = "tf.If"(%0, %arg0, %arg1) {then_branch = @add, else_branch = @sub, output_shapes = [#tf.shape<>], device = "noodle", is_stateless = true} : (tensor, tensor, tensor) -> tensor + // CHECK: %1 = "tf.StatefulPartitionedCall"(%0, %arg1) + // CHECK-SAME: _underscore_attr = "something" + // CHECK-SAME: f = @add + %3 = "tf.If"(%1, %2, %arg1) {then_branch = @add, else_branch = @sub, output_shapes = [#tf.shape<>], _underscore_attr = "something", is_stateless = false} : (tensor, tensor, tensor) -> tensor + + // CHECK: %2 = "tf.If" + %4 = "tf.If"(%arg2, %3, %arg1) {then_branch = @add, else_branch = @sub, is_stateless = false} : (tensor, tensor, tensor) -> tensor + + // CHECK: return %2 + return %4 : tensor +} + // CHECK-LABEL: foldCase func @foldCase(%arg0: tensor, %arg1: tensor) -> (tensor) { %2 = constant dense<1> : tensor @@ -872,3 +1035,36 @@ func @testWhileRegionUnusedValue(%arg0 : tensor<*xf32>, %arg1 : tensor, %ar // CHECK: return %[[WHILE_OUT]]#0 : tensor<*xf32> return %0#0 : tensor<*xf32> } + +// Check that output_shapes attribute is removed for tf.If +func @testIfThen(tensor<*xf32>) -> tensor<*xf32> +func @testIfElse(tensor<*xf32>) -> tensor<*xf32> +// CHECK-LABEL: func @testIfDropOutputShapes +func @testIfDropOutputShapes(tensor, tensor<2xf32>) -> tensor<2xf32> { +^bb0(%arg0: tensor, %arg1: tensor<2xf32>): + // CHECK: "tf.If" + // CHECK-NOT: output_shapes + %1 = "tf.If"(%arg0, %arg1) { + then_branch = @testIfThen, else_branch = @testIfElse, is_stateless = false, output_shapes = [#tf.shape<>] + } : (tensor, tensor<2xf32>) -> tensor<2xf32> + + return %1 : tensor<2xf32> +} + +// Check that output_shapes attribute is removed for tf.Whileß +func @testWhileCond(tensor<*xf32>) -> (tensor) +func @testWhileBody(tensor<*xf32>) -> (tensor<*xf32>) +// CHECK-LABEL: func @testWhileDropOutputShapes +func @testWhileDropOutputShapes(tensor<*xf32>) -> (tensor<*xf32>) { +^bb0(%arg0: tensor<*xf32>): + // CHECK: "tf.While" + // CHECK-NOT: output_shapes + %1 = "tf.While"(%arg0) { + cond = @testWhileCond, + body = @testWhileBody, + is_stateless = false, + output_shapes = [#tf.shape<>] + } : (tensor<*xf32>) -> (tensor<*xf32>) + + return %1 : tensor<*xf32> +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir b/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir index 7b8c998bcf1..b86815dbe57 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir @@ -443,7 +443,7 @@ func @DontRemoveTrivialMul(%arg0: tensor<1x6x8x1xf32>) -> tensor<1x6x8x1xf32> { // CHECK: return %[[RESULT]] : tensor<1x6x8x1xf32> } -// Do not fold if total result size is large (>128 KB) and more than 2 times +// Do not fold if total result size is large (>256 KB) and more than 2 times // the size of operands. 
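// For example, under this policy a fold that would materialize a 300 KB
// constant from operands totalling 100 KB is skipped (the result is both over
// 256 KB and more than 2x the operands), while a 300 KB result from 200 KB of
// operands, or a 200 KB result from 10 KB of operands, would still be folded.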
// LINT.IfChange(folding-policy-test) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/einsum.mlir b/tensorflow/compiler/mlir/tensorflow/tests/einsum.mlir index 130887555b0..e7430993755 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/einsum.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/einsum.mlir @@ -98,6 +98,31 @@ func @einsum_transposereduceddim(%arg0: tensor<2x5x7xf32>, %arg1: tensor<2x5x3x7 // CHECK: return %[[v3]] : tensor<2x5x3xf32> } +func @einsum_fourdreducelast(%arg0: tensor<2x5x7x3xf32>, %arg1: tensor<2x3x5x13xf32>) -> tensor<2x7x5x13xf32> { + %0 = "tf.Einsum"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", equation = "acbe,aecd->abcd"}: (tensor<2x5x7x3xf32>, tensor<2x3x5x13xf32>) -> tensor<2x7x5x13xf32> + return %0 : tensor<2x7x5x13xf32> + // CHECK-LABEL: einsum_fourdreducelast + // CHECK: %[[cst:.*]] = constant dense<[0, 2, 1, 3]> : tensor<4xi32> + // CHECK: %[[v0:.*]] = "tf.Transpose"(%arg1, %[[cst]]) : (tensor<2x3x5x13xf32>, tensor<4xi32>) -> tensor<2x5x3x13xf32> + // CHECK: %[[v1:.*]] = "tf.BatchMatMulV2"(%arg0, %[[v0]]) {adj_x = false, adj_y = false} : (tensor<2x5x7x3xf32>, tensor<2x5x3x13xf32>) -> tensor<2x5x7x13xf32> + // CHECK: %[[v2:.*]] = "tf.Transpose"(%[[v1]], %[[cst]]) : (tensor<2x5x7x13xf32>, tensor<4xi32>) -> tensor<2x7x5x13xf32> + // CHECK: return %[[v2]] : tensor<2x7x5x13xf32> +} + +func @einsum_fourdtransposeall(%arg0: tensor<2x5x7x3xf32>, %arg1: tensor<2x11x7x3xf32>) -> tensor<2x7x11x5xf32> { + %0 = "tf.Einsum"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", equation = "aecd,abcd->acbe"}: (tensor<2x5x7x3xf32>, tensor<2x11x7x3xf32>) -> tensor<2x7x11x5xf32> + return %0 : tensor<2x7x11x5xf32> + // CHECK-LABEL: einsum_fourdtransposeall + // CHECK: %[[cst:.*]] = constant dense<[0, 2, 1, 3]> : tensor<4xi32> + // CHECK: %[[cst_1:.*]] = constant dense<[0, 2, 3, 1]> : tensor<4xi32> + // CHECK: %[[cst_2:.*]] = constant dense<[0, 1, 3, 2]> : tensor<4xi32> + // CHECK: %[[v0:.*]] = "tf.Transpose"(%arg0, %[[cst]]) : (tensor<2x5x7x3xf32>, tensor<4xi32>) -> tensor<2x7x5x3xf32> + // CHECK: %[[v1:.*]] = "tf.Transpose"(%arg1, %[[cst_1]]) : (tensor<2x11x7x3xf32>, tensor<4xi32>) -> tensor<2x7x3x11xf32> + // CHECK: %[[v2:.*]] = "tf.BatchMatMulV2"(%[[v0]], %[[v1]]) {adj_x = false, adj_y = false} : (tensor<2x7x5x3xf32>, tensor<2x7x3x11xf32>) -> tensor<2x7x5x11xf32> + // CHECK: %[[v3:.*]] = "tf.Transpose"(%[[v2]], %[[cst_2]]) : (tensor<2x7x5x11xf32>, tensor<4xi32>) -> tensor<2x7x11x5xf32> + // CHECK: return %[[v3]] : tensor<2x7x11x5xf32> +} + func @einsum_no_match(%arg0: tensor<4x5xf32>, %arg1: tensor<5xf32>) -> tensor<4xf32> { %0 = "tf.Einsum"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", equation = "ij,j->i"}: (tensor<4x5xf32>, tensor<5xf32>) -> tensor<4xf32> return %0 : tensor<4xf32> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_island_inlining/executor_tpuv1_inline_tpu_island.mlir b/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_island_inlining/executor_tpuv1_inline_tpu_island.mlir index f45f0a435c3..b7bdf505a85 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_island_inlining/executor_tpuv1_inline_tpu_island.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_island_inlining/executor_tpuv1_inline_tpu_island.mlir @@ -35,11 +35,11 @@ module { } // CHECK-NOT: _tpu_v1_compat_outlined module @_tpu_v1_compat_outlined { - func @_tpu_v1_compat_outlined_func0(%arg0: tensor) -> tensor { + func @_tpu_v1_compat_outlined_func0(%arg0: tensor) -> tensor attributes {sym_visibility = "nested"} { %0 = "tf.opA"(%arg0) : (tensor) -> 
tensor return %0 : tensor } - func @_tpu_v1_compat_outlined_func1(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) { + func @_tpu_v1_compat_outlined_func1(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) attributes {sym_visibility = "nested"} { %0 = "tf.opA"(%arg0) : (tensor) -> tensor %1 = "tf.opA"(%0) : (tensor) -> tensor %2 = "tf.SomeOp"(%arg0, %arg1) : (tensor, tensor) -> tensor diff --git a/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_island_inlining/while_op.mlir b/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_island_inlining/while_op.mlir index 8c174a7cfaf..6724033d292 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_island_inlining/while_op.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_island_inlining/while_op.mlir @@ -12,7 +12,7 @@ module { return %0#0 : tensor } module @_tpu_v1_compat_outlined { - func @_tpu_v1_compat_outlined_func0(%arg0: tensor) -> (tensor, tensor, tensor, tensor) { + func @_tpu_v1_compat_outlined_func0(%arg0: tensor) -> (tensor, tensor, tensor, tensor) attributes {sym_visibility = "nested"} { "tf.TPUReplicateMetadata"() {_tpu_replicate = "cluster", device = "device", num_replicas = 1 : i64, topology = "topology"} : () -> () %0 = "tf.opA"(%arg0) {_tpu_replicate = "cluster"} : (tensor) -> tensor %1 = "tf.While"(%0) {body = @while_body_with_cluster_attr, cond = @while_cond_with_cluster_attr, is_stateless = false, name = "A", parallel_iterations = 10 : i64} : (tensor) -> tensor diff --git a/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-regions.mlir b/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-regions.mlir index a7e9b22d72b..c8c82c5c08f 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-regions.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-regions.mlir @@ -1,5 +1,6 @@ -// RUN: tf-opt %s -tf-functional-control-flow-to-regions -split-input-file | FileCheck %s --dump-input=fail +// RUN: tf-opt %s -tf-functional-control-flow-to-regions -split-input-file | FileCheck %s +// Simple If // CHECK: func @testIf1Then{{.+}} // CHECK: func @testIf1Else{{.+}} func @testIf1Then(tensor<*xf32>) -> tensor<*xf32> @@ -8,7 +9,8 @@ func @testIf1Else(tensor<*xf32>) -> tensor<*xf32> // CHECK-LABEL: func @testIf1Result(%arg0: tensor, %arg1: tensor<*xf32>) func @testIf1Result(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<*xf32> { %0 = "tf.If"(%arg0, %arg1) { - then_branch = @testIf1Then, else_branch = @testIf1Else, is_stateless = false + then_branch = @testIf1Then, else_branch = @testIf1Else, is_stateless = false, + _attr0 = 10, _attr1 = true, attr2 = "hello" } : (tensor, tensor<*xf32>) -> tensor<*xf32> // CHECK: "tf.IfRegion" @@ -16,12 +18,19 @@ func @testIf1Result(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<*xf32> { // CHECK: "tf.Yield"([[Result0]]) // CHECK: [[Result1:%.*]] = call @testIf1Else // CHECK: "tf.Yield"([[Result1]]) + // CHECK: _attr0 = 10 + // CHECK-SAME: _attr1 = true + // CHECK-NOT: attr2 = + // CHECK-NOT: else_branch + // CHECK-SAME: is_stateless = false + // CHECK-NOT: then_branch + // CHECK-SAME: } return %0 : tensor<*xf32> } // ----- -// With mismatching input types +// If with mismatching input types // CHECK: func @testIf1Then{{.+}} // CHECK: func @testIf1Else{{.+}} @@ -46,7 +55,7 @@ func @testIf2Result(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { // ----- -// No inputs, some outputs +// If with no inputs, some outputs // CHECK: func @testIf1Then{{.+}} // CHECK: func 
@testIf1Else{{.+}} func @testIf1Then() -> tensor<*xf32> @@ -68,7 +77,7 @@ func @testIfNoInputs(%arg0: tensor) -> tensor<2xf32> { // ----- -// No outputs, some inputs +// If with no outputs, some inputs // CHECK: func @testIf1Then{{.+}} // CHECK: func @testIf1Else{{.+}} func @testIf1Then(tensor<*xf32>) -> () @@ -91,7 +100,8 @@ func @testIfNoResult(%arg0: tensor, %arg1: tensor<2xf32>) -> () { } // ----- -// No outputs, No inputs + +// If with no outputs, No inputs // CHECK: func @testIf1Then{{.+}} // CHECK: func @testIf1Else{{.+}} func @testIf1Then() -> () @@ -111,3 +121,82 @@ func @testIfNoInputAndNoResult(%arg0: tensor) -> () { return } +// ----- + +// Simple While +func @testWhileCond(tensor<*xf32>) -> (tensor) +func @testWhileBody(tensor<*xf32>) -> (tensor<*xf32>) + +// CHECK-LABEL: func @testWhileResult +func @testWhileResult(tensor<*xf32>) -> (tensor<*xf32>) { +^bb0(%arg0: tensor<*xf32>): + %1 = "tf.While"(%arg0) { + cond = @testWhileCond, + body = @testWhileBody, + is_stateless = true, + _attr0 = 10, _attr1 = true, attr2 = "hello" + } : (tensor<*xf32>) -> (tensor<*xf32>) + + // CHECK: [[Result0:%.*]] = "tf.WhileRegion" + // CHECK: [[Result1:%.*]] = call @testWhileCond + // CHECK: "tf.Yield"([[Result1]]) + // CHECK: [[Result2:%.*]] = call @testWhileBody + // CHECK: "tf.Yield"([[Result2]]) + // CHECK: _attr0 = 10 + // CHECK-SAME: _attr1 = true + // CHECK-NOT: attr2 = + // CHECK-NOT: cond = + // CHECK-NOT: body = + // CHECK-SAME: is_stateless = true + // CHECK: return [[Result0]] + return %1 : tensor<*xf32> +} + +// ----- + +// While with no inputs & outputs +func @testWhileCond() -> (tensor) +func @testWhileBody() -> () + +// CHECK-LABEL: func @testWhileResultNoIO +func @testWhileResultNoIO() -> () { + "tf.While"() { + cond = @testWhileCond, + body = @testWhileBody, + is_stateless = false + } : () -> () + + // CHECK: "tf.WhileRegion" + // CHECK: [[Result1:%.*]] = call @testWhileCond + // CHECK: "tf.Yield"([[Result1]]) + // CHECK: call @testWhileBody + // CHECK: "tf.Yield"() + return +} + +// ----- + +// While with type mismatch +func @testWhileCond(tensor<4xf32>) -> (tensor) +func @testWhileBody(tensor<4xf32>) -> (tensor<4xf32>) + +// CHECK-LABEL: func @testWhileResult +func @testWhileResult(tensor<*xf32>) -> (tensor<*xf32>) { +^bb0(%arg0: tensor<*xf32>): + %1 = "tf.While"(%arg0) { + cond = @testWhileCond, + body = @testWhileBody, + is_stateless = false + } : (tensor<*xf32>) -> (tensor<*xf32>) + + // CHECK: [[Result0:%.*]] = "tf.WhileRegion" + // CHECK: [[ResultCast0:%.*]] = "tf.Cast" + // CHECK: [[Result1:%.*]] = call @testWhileCond([[ResultCast0]]) + // CHECK: "tf.Yield"([[Result1]]) + // CHECK: [[ResultCast1:%.*]] = "tf.Cast" + // CHECK: [[Result2:%.*]] = call @testWhileBody([[ResultCast1]]) + // CHECK: "tf.Yield"([[Result2]]) + // CHECK: return [[Result0]] + return %1 : tensor<*xf32> +} + diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/case_op.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/case_op.pbtxt new file mode 100644 index 00000000000..1372ad71283 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/case_op.pbtxt @@ -0,0 +1,261 @@ +# RUN: tf-mlir-translate -graphdef-to-splatted-mlir %s -o - | FileCheck %s + +node { + name: "Const" + op: "Const" + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + } + int_val: 1 + } + } + } +} +node { + name: "Const_1" + op: "Const" + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr 
{ + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + } + int_val: 0 + } + } + } +} +node { + name: "indexed_case" + op: "StatelessCase" + input: "Const_1" + input: "Const" + attr { + key: "Tin" + value { + list { + type: DT_INT32 + } + } + } + attr { + key: "Tout" + value { + list { + type: DT_INT32 + } + } + } + attr { + key: "_lower_using_switch_merge" + value { + b: true + } + } + attr { + key: "_read_only_resource_inputs" + value { + list { + } + } + } + attr { + key: "branches" + value { + list { + func { + name: "indexed_case_branch0_4" + } + func { + name: "indexed_case_branch1_5" + } + } + } + } + attr { + key: "output_shapes" + value { + list { + shape { + } + } + } + } +} +node { + name: "indexed_case/Identity" + op: "Identity" + input: "indexed_case" + attr { + key: "T" + value { + type: DT_INT32 + } + } +} +library { + function { + signature { + name: "indexed_case_branch0_4" + input_arg { + name: "add_const" + type: DT_INT32 + } + output_arg { + name: "add" + type: DT_INT32 + } + } + node_def { + name: "add/y" + op: "Const" + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + } + int_val: 1 + } + } + } + experimental_debug_info { + original_node_names: "add/y" + } + } + node_def { + name: "add_0" + op: "AddV2" + input: "add_const" + input: "add/y:output:0" + attr { + key: "T" + value { + type: DT_INT32 + } + } + experimental_debug_info { + original_node_names: "add" + } + } + ret { + key: "add" + value: "add_0:z:0" + } + arg_attr { + key: 0 + value { + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + } + } + } + function { + signature { + name: "indexed_case_branch1_5" + input_arg { + name: "add_const" + type: DT_INT32 + } + output_arg { + name: "add" + type: DT_INT32 + } + } + node_def { + name: "add/y" + op: "Const" + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + } + int_val: 2 + } + } + } + experimental_debug_info { + original_node_names: "add/y" + } + } + node_def { + name: "add_0" + op: "AddV2" + input: "add_const" + input: "add/y:output:0" + attr { + key: "T" + value { + type: DT_INT32 + } + } + experimental_debug_info { + original_node_names: "add" + } + } + ret { + key: "add" + value: "add_0:z:0" + } + arg_attr { + key: 0 + value { + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + } + } + } +} +versions { + producer: 486 + min_consumer: 12 +} + +# CHECK: tf.Case +# CHECK-SAME: is_stateless diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-custom-operation.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-custom-operation.pbtxt index cf08d55b3cb..304429c8783 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-custom-operation.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-custom-operation.pbtxt @@ -54,5 +54,5 @@ versions { # the names are matching between the function definition and the uses / call # site (a numerical suffix may be appended). 
-# CHECK: "tf.LegacyCall"(%outputs) {_disable_call_shape_inference = false, f = @foo0} +# CHECK: "tf.LegacyCall"(%outputs) {_disable_call_shape_inference = false, device = "", f = @foo0} # CHECK: func @foo0 diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-call.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-call.pbtxt index fa6f63e27a5..f954657765a 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-call.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-call.pbtxt @@ -34,6 +34,12 @@ node { b: true } } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } } library { function { @@ -62,4 +68,4 @@ library { } # CHECK: func @main -# CHECK: "tf.LegacyCall"(%arg0) {_disable_call_shape_inference = true, f = @test_func_name0} +# CHECK: "tf.LegacyCall"(%arg0) {_disable_call_shape_inference = true, _tpu_replicate = "cluster", device = "", f = @test_func_name0} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-name-bug.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-name-bug.pbtxt index 8cf6d4ed5d5..326e7b1ecd4 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-name-bug.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-name-bug.pbtxt @@ -121,8 +121,8 @@ versions { # Verify that functions from the library are properly imported. # CHECK-LABEL: func @main() { -# CHECK: "tf.LegacyCall"() {_disable_call_shape_inference = false, f = @foo110} -# CHECK: "tf.LegacyCall"() {_disable_call_shape_inference = false, f = @foo111} +# CHECK: "tf.LegacyCall"() {_disable_call_shape_inference = false, device = "", f = @foo110} +# CHECK: "tf.LegacyCall"() {_disable_call_shape_inference = false, device = "", f = @foo111} # CHECK-LABEL: func @foo110() attributes {sym_visibility = "private"} # CHECK-LABEL: func @foo111() attributes {sym_visibility = "private"} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-resource-args.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-resource-args.pbtxt index eb358d52b26..7cb7ac7e008 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-resource-args.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-resource-args.pbtxt @@ -88,7 +88,7 @@ library { # CHECK: tf_executor.graph # CHECK: "tf.VarHandleOp"() # CHECK: "tf.LegacyCall" -# CHECK-SAME: {_disable_call_shape_inference = true, f = @test_func_name0} +# CHECK-SAME: {_disable_call_shape_inference = true, device = "", f = @test_func_name0} # CHECK: tf_executor.fetch # CHECK: return # CHECK: func @test_func_name0 diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-library.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-library.pbtxt index 55a76b1b668..53e951473d0 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-library.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-library.pbtxt @@ -54,10 +54,10 @@ versions { # Verify that functions from the library are properly imported. 
# CHECK-LABEL: func @main() { -# CHECK: "tf.LegacyCall"() {_disable_call_shape_inference = true, f = @foo0} -# CHECK: "tf.LegacyCall"() {_disable_call_shape_inference = false, f = @bar0} +# CHECK: "tf.LegacyCall"() {_disable_call_shape_inference = true, device = "", f = @foo0} +# CHECK: "tf.LegacyCall"() {_disable_call_shape_inference = false, device = "", f = @bar0} # CHECK-LABEL: func @foo0() attributes {sym_visibility = "private"} -# CHECK: "tf.LegacyCall"() {_disable_call_shape_inference = false, f = @bar0} +# CHECK: "tf.LegacyCall"() {_disable_call_shape_inference = false, device = "", f = @bar0} # CHECK-LABEL: func @bar0() attributes {sym_visibility = "private"} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/guarantee-all-funcs-one-use.mlir b/tensorflow/compiler/mlir/tensorflow/tests/guarantee-all-funcs-one-use.mlir new file mode 100644 index 00000000000..d8903846158 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/guarantee-all-funcs-one-use.mlir @@ -0,0 +1,54 @@ +// RUN: tf-opt %s -split-input-file -verify-diagnostics -tf-guarantee-all-funcs-one-use | FileCheck %s + +// ----- +// Basic test. +// CHECK-LABEL: func @f +func @f() { + // CHECK: call @g() : () -> () + // CHECK: call @[[NEWG:.+]]() : () -> () + call @g() : () -> () + call @g() : () -> () + return +} + +// CHECK: func @g() +// CHECK: func @[[NEWG]]() attributes {sym_visibility = "private"} +func @g() { + return +} + +// ----- +// Transitive callees. +// CHECK-LABEL: func @f +// 2 copies of @g +// CHECK-DAG: func @g{{.*}} +// CHECK-DAG: func @g{{.*}} +// 4 copies of @h +// CHECK-DAG: func @h{{.*}} +// CHECK-DAG: func @h{{.*}} +// CHECK-DAG: func @h{{.*}} +// CHECK-DAG: func @h{{.*}} +func @f() { + call @g() : () -> () + call @g() : () -> () + return +} + +func @g() { + call @h() : () -> () + call @h() : () -> () + return +} + +func @h() { + return +} + +// ----- +// Handle error case of infinite recursion. +// expected-error @+1 {{reached cloning limit}} +func @f() attributes {sym_visibility = "private"} { + call @f() : () -> () + call @f() : () -> () + return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/init_text_file_to_import.mlir b/tensorflow/compiler/mlir/tensorflow/tests/init_text_file_to_import.mlir new file mode 100644 index 00000000000..6a9581b0e44 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/init_text_file_to_import.mlir @@ -0,0 +1,14 @@ +// RUN: tf-opt -tf-init-text-file-to-import-test %s | FileCheck %s + +// Tests that the tf.InitializeTableFromTextFileV2 op are inlined. 
+ +func @init_all_tables() { + %cst = constant dense<"%FILE_PLACEHOLDER"> : tensor + %0 = "tf.HashTableV2"() {container = "", device = "", key_dtype = !tf.string, shared_name = "hash_table_/tmp/vocab.txt_-2_-1", use_node_name_sharing = false, value_dtype = i64} : () -> tensor + "tf.InitializeTableFromTextFileV2"(%0, %cst) {delimiter = " ", device = "", key_index = -2 : i64, value_index = -1 : i64, vocab_size = -1 : i64} : (tensor, tensor) -> () + return + // CHECK: [[CST:%.*]] = constant dense<["apple", "banana", "grape"]> : tensor<3x!tf.string> + // CHECK: [[CST_0:%.*]] = constant dense<[0, 1, 2]> : tensor<3xi64> + // CHECK: [[VAL:%.*]] = "tf.HashTableV2"() + // CHECK: "tf.LookupTableImportV2"([[VAL]], [[CST]], [[CST_0]]) +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/init_text_file_to_import_invalid.mlir b/tensorflow/compiler/mlir/tensorflow/tests/init_text_file_to_import_invalid.mlir new file mode 100644 index 00000000000..05afe1cc27f --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/init_text_file_to_import_invalid.mlir @@ -0,0 +1,53 @@ +// RUN: tf-opt -split-input-file -verify-diagnostics -tf-init-text-file-to-import %s | FileCheck %s + +// Tests that the given vocabulary file does not exist. + +func @init_all_tables() { + %cst = constant dense<"vocab_file_does_not_exist.txt"> : tensor + %0 = "tf.HashTableV2"() {container = "", device = "", key_dtype = !tf.string, shared_name = "hash_table_/tmp/vocab.txt_-2_-1", use_node_name_sharing = false, value_dtype = i64} : () -> tensor + // expected-error @+1 {{'tf.InitializeTableFromTextFileV2' op failed to open vocabulary file (vocab_file_does_not_exist.txt): cannot open input file 'vocab_file_does_not_exist.txt': No such file or directory}} + "tf.InitializeTableFromTextFileV2"(%0, %cst) {delimiter = " ", device = "", key_index = -2 : i64, value_index = -1 : i64, vocab_size = -1 : i64} : (tensor, tensor) -> () + return +} + +// ----- + +// Tests that the tf.InitializeTableFromTextFileV2 op is not converted since +// unsupported key_index, -1. + +func @init_all_tables() { + %cst = constant dense<"vocab_file_does_not_exist.txt"> : tensor + %0 = "tf.HashTableV2"() {container = "", device = "", key_dtype = !tf.string, shared_name = "hash_table_/tmp/vocab.txt_-2_-1", use_node_name_sharing = false, value_dtype = i64} : () -> tensor + "tf.InitializeTableFromTextFileV2"(%0, %cst) {delimiter = " ", device = "", key_index = -1 : i64, value_index = -1 : i64, vocab_size = -1 : i64} : (tensor, tensor) -> () + return + // CHECK: [[VAL:%.*]] = "tf.HashTableV2"() + // CHECK: tf.InitializeTableFromTextFileV2" +} + +// ----- + +// Tests that the tf.InitializeTableFromTextFileV2 op is not converted since +// unsupported value_index, 0. + +func @init_all_tables() { + %cst = constant dense<"vocab_file_does_not_exist.txt"> : tensor + %0 = "tf.HashTableV2"() {container = "", device = "", key_dtype = !tf.string, shared_name = "hash_table_/tmp/vocab.txt_-2_-1", use_node_name_sharing = false, value_dtype = i64} : () -> tensor + "tf.InitializeTableFromTextFileV2"(%0, %cst) {delimiter = " ", device = "", key_index = -2 : i64, value_index = 0 : i64, vocab_size = -1 : i64} : (tensor, tensor) -> () + return + // CHECK: [[VAL:%.*]] = "tf.HashTableV2"() + // CHECK: tf.InitializeTableFromTextFileV2" +} + +// ----- + +// Tests that the tf.InitializeTableFromTextFileV2 op is not converted since +// unsupported vocab_size, 1. 
+ +func @init_all_tables() { + %cst = constant dense<"vocab_file_does_not_exist.txt"> : tensor + %0 = "tf.HashTableV2"() {container = "", device = "", key_dtype = !tf.string, shared_name = "hash_table_/tmp/vocab.txt_-2_-1", use_node_name_sharing = false, value_dtype = i64} : () -> tensor + "tf.InitializeTableFromTextFileV2"(%0, %cst) {delimiter = " ", device = "", key_index = -2 : i64, value_index = -1 : i64, vocab_size = 1 : i64} : (tensor, tensor) -> () + return + // CHECK: [[VAL:%.*]] = "tf.HashTableV2"() + // CHECK: tf.InitializeTableFromTextFileV2" +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/inlining.mlir b/tensorflow/compiler/mlir/tensorflow/tests/inlining.mlir index 5f4bffcc7c2..7e583d0425a 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/inlining.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/inlining.mlir @@ -2,7 +2,7 @@ // Test that simple TF operations can be inlined. -func @inline_simple_callee() -> tensor<2xi32> { +func @inline_simple_callee() -> tensor<2xi32> attributes {sym_visibility = "private"} { %cst = "tf.Const"() { value = dense<2> : tensor<2xi32> } : () -> tensor<2xi32> return %cst : tensor<2xi32> } @@ -18,7 +18,7 @@ func @inline_simple() -> tensor<2xi32> { // Check that TF call operations can be inlined, even when the shape of the // argument or result is different than the called function. -func @inline_shape_cast_callee(%arg : tensor<*xi32>) -> tensor<*xi32> { +func @inline_shape_cast_callee(%arg : tensor<*xi32>) -> tensor<*xi32> attributes {sym_visibility = "private"} { return %arg : tensor<*xi32> } @@ -34,7 +34,12 @@ func @inline_shape_cast(%arg: tensor<2xi32>) -> tensor<2xi32> { // Check that functions can be inlined into islands. -func @inline_into_island_multi_block_callee() -> tensor<2xi32> { +func @inline_simple_callee1() -> tensor<2xi32> attributes {sym_visibility = "private"} { + %cst = "tf.Const"() { value = dense<2> : tensor<2xi32> } : () -> tensor<2xi32> + return %cst : tensor<2xi32> +} + +func @inline_into_island_multi_block_callee() -> tensor<2xi32> attributes {sym_visibility = "private"} { br ^bb1 ^bb1: @@ -48,7 +53,7 @@ func @inline_into_island() -> (tensor<2xi32>, tensor<2xi32>) { %1:3 = tf_executor.island { // Single block regions may be inlined. // CHECK: %[[CST:.*]] = "tf.Const" - %result = "tf.StatefulPartitionedCall"() {config = "", config_proto = "", executor_type = "", f = @inline_simple_callee} : () -> tensor<2xi32> + %result = "tf.StatefulPartitionedCall"() {config = "", config_proto = "", executor_type = "", f = @inline_simple_callee1} : () -> tensor<2xi32> // Multi block regions may not. 
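    // (A tf_executor.island body is a single-block region, so a callee whose
    // body spans multiple blocks cannot be inlined into it; the call remains.)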
// CHECK-NEXT: %[[CALL:.*]] = "tf.StatefulPartitionedCall" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/lower_tf.mlir b/tensorflow/compiler/mlir/tensorflow/tests/lower_tf.mlir index 3215055a249..e11474c0755 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/lower_tf.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/lower_tf.mlir @@ -1,4 +1,4 @@ -// RUN: tf-opt %s -test-tf-lower-tf | FileCheck %s +// RUN: tf-opt %s -test-tf-lower-tf | FILECHECK_OPTS="" FileCheck %s // CHECK-LABEL: invert_permutation func @invert_permutation(%arg0: tensor<5xi32>) -> tensor<5xi32> { @@ -353,8 +353,16 @@ func @ZerosLike_variant(%arg0: tensor>>) -> tensor>> } -// CHECK-LABEL: func @addN -func @addN(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>, %arg2: tensor<*xf32>) -> tensor<*xf32> { +// CHECK-LABEL: func @addN_2 +func @addN_2(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> { + // CHECK: %[[SUM0:.*]] = "tf.AddV2"(%arg0, %arg1) + // return %[[SUM0]] + %0 = "tf.AddN"(%arg0, %arg1) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + +// CHECK-LABEL: func @addN_3 +func @addN_3(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>, %arg2: tensor<*xf32>) -> tensor<*xf32> { // CHECK: %[[SUM0:.*]] = "tf.AddV2"(%arg0, %arg1) // CHECK: %[[SUM1:.*]] = "tf.AddV2"(%[[SUM0]], %arg2) // return %[[SUM1]] @@ -362,6 +370,27 @@ func @addN(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>, %arg2: tensor<*xf32>) -> return %0 : tensor<*xf32> } +// CHECK-LABEL: func @addN_4 +func @addN_4(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>, %arg2: tensor<*xf32>, %arg3: tensor<*xf32>) -> tensor<*xf32> { + // CHECK: %[[SUM0:.*]] = "tf.AddV2"(%arg0, %arg1) + // CHECK: %[[SUM1:.*]] = "tf.AddV2"(%arg2, %arg3) + // CHECK: %[[SUM2:.*]] = "tf.AddV2"(%[[SUM0]], %[[SUM1]]) + // return %[[SUM2]] + %0 = "tf.AddN"(%arg0, %arg1, %arg2, %arg3) : (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + +// CHECK-LABEL: func @addN_5 +func @addN_5(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>, %arg2: tensor<*xf32>, %arg3: tensor<*xf32>, %arg4: tensor<*xf32>) -> tensor<*xf32> { + // CHECK: %[[SUM0:.*]] = "tf.AddV2"(%arg0, %arg1) + // CHECK: %[[SUM1:.*]] = "tf.AddV2"(%arg2, %arg3) + // CHECK: %[[SUM2:.*]] = "tf.AddV2"(%[[SUM0]], %[[SUM1]]) + // CHECK: %[[SUM3:.*]] = "tf.AddV2"(%[[SUM2]], %arg4) + // return %[[SUM3]] + %0 = "tf.AddN"(%arg0, %arg1, %arg2, %arg3, %arg4) : (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>, tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + // CHECK-LABEL: func @addN_variant func @addN_variant(%arg0: tensor>>, %arg1: tensor>>, %arg2: tensor>>) -> tensor>> { // CHECK: tf.AddN @@ -371,9 +400,7 @@ func @addN_variant(%arg0: tensor>>, %arg1: tensor) -> tensor<2x2xf32> { - // CHECK-DAG: %[[SHAPE:.*]] = "tf.Const"() {value = dense<[-1, 2]> : tensor<2xi64>} : () -> tensor<2xi64> - // CHECK-DAG: %[[INP:.*]] = "tf.Reshape"(%arg0, %[[SHAPE]]) : (tensor<2x2xf32>, tensor<2xi64>) -> tensor<2x2xf32> - // CHECK-DAG: %[[ITEMS:.*]]:2 = "tf.Unpack"(%[[INP]]) {axis = 0 : i64} : (tensor<2x2xf32>) -> (tensor<2xf32>, tensor<2xf32>) + // CHECK-DAG: %[[ITEMS:.*]]:2 = "tf.Unpack"(%arg0) {axis = 0 : i64} : (tensor<2x2xf32>) -> (tensor<2xf32>, tensor<2xf32>) // CHECK-DAG: %[[AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor // CHECK-DAG: %[[RESULT:.*]] = "tf.ConcatV2"(%[[ITEMS]]#1, %[[ITEMS]]#0, %[[AXIS]]) : (tensor<2xf32>, tensor<2xf32>, tensor) -> tensor<2x2xf32> // CHECK: return %[[RESULT]] @@ -411,9 +438,7 @@ func 
@DynamicStitch_uint8(%arg0: tensor<2x2xui8>) -> tensor<2x2xui8> { // CHECK-LABEL: func @DynamicStitch_scalar_item func @DynamicStitch_scalar_item(%arg0: tensor<2xf32>) -> tensor<2xf32> { - // CHECK-DAG: %[[SHAPE:.*]] = "tf.Const"() {value = dense<-1> : tensor<1xi64>} : () -> tensor<1xi64> - // CHECK-DAG: %[[INP:.*]] = "tf.Reshape"(%arg0, %[[SHAPE]]) : (tensor<2xf32>, tensor<1xi64>) -> tensor<2xf32> - // CHECK-DAG: %[[ITEMS]]:2 = "tf.Unpack"(%[[INP]]) {axis = 0 : i64} : (tensor<2xf32>) -> (tensor, tensor) + // CHECK-DAG: %[[ITEMS]]:2 = "tf.Unpack"(%arg0) {axis = 0 : i64} : (tensor<2xf32>) -> (tensor, tensor) // CHECK-DAG: %[[AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor // CHECK-DAG: %[[RESULT]] = "tf.ConcatV2"(%[[ITEMS]]#1, %[[ITEMS]]#0, %[[AXIS]]) : (tensor, tensor, tensor) -> tensor<2xf32> // CHECK: return %[[RESULT]] @@ -425,9 +450,7 @@ func @DynamicStitch_scalar_item(%arg0: tensor<2xf32>) -> tensor<2xf32> { // CHECK-LABEL: func @DynamicStitch_matrix_item func @DynamicStitch_matrix_item(%arg0: tensor<2x2x2xf32>) -> tensor<2x2x2xf32> { - // CHECK-DAG: %[[SHAPE:.*]] = "tf.Const"() {value = dense<[-1, 2, 2]> : tensor<3xi64>} : () -> tensor<3xi64> - // CHECK-DAG: %[[INP:.*]] = "tf.Reshape"(%arg0, %[[SHAPE]]) : (tensor<2x2x2xf32>, tensor<3xi64>) -> tensor<2x2x2xf32> - // CHECK-DAG: %[[ITEMS:.*]]:2 = "tf.Unpack"(%[[INP]]) {axis = 0 : i64} : (tensor<2x2x2xf32>) -> (tensor<2x2xf32>, tensor<2x2xf32>) + // CHECK-DAG: %[[ITEMS:.*]]:2 = "tf.Unpack"(%arg0) {axis = 0 : i64} : (tensor<2x2x2xf32>) -> (tensor<2x2xf32>, tensor<2x2xf32>) // CHECK-DAG: %[[AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor // CHECK-DAG: %[[RESULT:.*]] = "tf.ConcatV2"(%[[ITEMS]]#1, %[[ITEMS]]#0, %[[AXIS]]) : (tensor<2x2xf32>, tensor<2x2xf32>, tensor) -> tensor<2x2x2xf32> // CHECK: return %[[RESULT]] @@ -446,9 +469,7 @@ func @DynamicStitch_dynamic(%arg0: tensor<*xi32>, %arg1: tensor<*xf32>) -> tenso // CHECK-LABEL: func @DynamicStitch_duplicates func @DynamicStitch_duplicates(%arg0: tensor<2x2xf32>) -> tensor<1x2xf32> { - // CHECK-DAG: %[[SHAPE:.*]] = "tf.Const"() {value = dense<[-1, 2]> : tensor<2xi64>} : () -> tensor<2xi64> - // CHECK-DAG: %[[INP:.*]] = "tf.Reshape"(%arg0, %[[SHAPE]]) : (tensor<2x2xf32>, tensor<2xi64>) -> tensor<2x2xf32> - // CHECK-DAG: %[[ITEMS:.*]]:2 = "tf.Unpack"(%[[INP]]) {axis = 0 : i64} : (tensor<2x2xf32>) -> (tensor<2xf32>, tensor<2xf32>) + // CHECK-DAG: %[[ITEMS:.*]]:2 = "tf.Unpack"(%arg0) {axis = 0 : i64} : (tensor<2x2xf32>) -> (tensor<2xf32>, tensor<2xf32>) // CHECK-DAG: %[[AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor // CHECK-DAG: %[[RESULT:.*]] = "tf.ConcatV2"(%[[ITEMS]]#1, %[[AXIS]]) : (tensor<2xf32>, tensor) -> tensor<1x2xf32> // CHECK: return %[[RESULT]] diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mark_ops_for_outside_compilation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mark_ops_for_outside_compilation.mlir new file mode 100644 index 00000000000..2d86889e35b --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/mark_ops_for_outside_compilation.mlir @@ -0,0 +1,257 @@ +// RUN: tf-opt %s -tf-mark-ops-for-outside-compilation | FILECHECK_OPTS="" FileCheck %s + +// CHECK-LABEL: func @unsupported_op +func @unsupported_op() -> tensor { + %0 = "tf_device.cluster"() ( { + // CHECK: "tf.UnsupportedOp" + // CHECK-SAME: _xla_outside_compilation + // CHECK: "tf.Identity" + // CHECK-NOT: _xla_outside_compilation + %1 = "tf.UnsupportedOp"() {value = dense<1> : tensor} : () -> tensor + %2 = "tf.Identity"(%1) : (tensor) -> 
tensor + tf_device.return %2 : tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + return %0 : tensor +} + +// CHECK-LABEL: func @tf2xla_fallback_op +func @tf2xla_fallback_op() -> tensor { + %0 = "tf_device.cluster"() ( { + // CHECK: "tf.UnsupportedOp" + // CHECK-SAME: _xla_outside_compilation + // CHECK: "tf.Identity" + // CHECK-NOT: _xla_outside_compilation + // CHECK: "tf.Sinh" + // CHECK-NOT: _xla_outside_compilation + %1 = "tf.UnsupportedOp"() {value = dense<1> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<1.0> : tensor} : () -> tensor + %3 = "tf.Identity"(%1) : (tensor) -> tensor + %4 = "tf.Sinh"(%2) : (tensor) -> tensor + tf_device.return %4 : tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + return %0 : tensor +} + +// CHECK-LABEL: func @ignore_embedding_ops +func @ignore_embedding_ops() -> () { + "tf_device.cluster"() ( { + // CHECK: "tf.RecvTPUEmbeddingActivations" + // CHECK-NOT: _xla_outside_compilation + // CHECK: "tf.SendTPUEmbeddingGradients" + // CHECK-NOT: _xla_outside_compilation + %2:2 = "tf.RecvTPUEmbeddingActivations"() {_tpu_embedding_layer = "call1", config = "\0A\0B\0C\0D"} : () -> (tensor<2x2xf32>, tensor<4x4xf32>) + "tf.SendTPUEmbeddingGradients"(%2#0, %2#1) {_tpu_embedding_layer = "call1", config = "\0A\0B\0C\0D", operand_segment_sizes = dense<[2, 0]> : vector<2xi32>} : (tensor<2x2xf32>, tensor<4x4xf32>) -> () + tf_device.return + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> () + return +} + +// CHECK-LABEL: func @op_string_result +func @op_string_result() -> tensor { + %0 = "tf_device.cluster"() ( { + // CHECK: "tf.Const"() {value = dense<1> : tensor} + // CHECK-NOT: _xla_outside_compilation + // CHECK: "tf.Const" + // CHECK-SAME: _xla_outside_compilation + // CHECK-SAME: tf.string + // CHECK: "tf.Identity" + // CHECK-NOT: _xla_outside_compilation + %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x"> : tensor} : () -> tensor + %3 = "tf.Identity"(%1) : (tensor) -> tensor + tf_device.return %3 : tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + return %0 : tensor +} +// CHECK-LABEL: func @op_string_operand +func @op_string_operand(%arg0: tensor) -> tensor { + %0 = "tf_device.cluster"() ( { + // CHECK: "tf.Const"() {value = dense<1> : tensor} + // CHECK-NOT: _xla_outside_compilation + // CHECK: "tf.StringToNumber" + // CHECK-SAME: _xla_outside_compilation + // CHECK-SAME: tf.string + // CHECK: "tf.Identity" + // CHECK-NOT: _xla_outside_compilation + %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %2 = "tf.StringToNumber"(%arg0) {out_type = f32} : (tensor) -> tensor + %3 = "tf.Identity"(%1) : (tensor) -> tensor + tf_device.return %3 : tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + return %0 : tensor +} + +// CHECK-LABEL: func @op_string_operand_string_result +func @op_string_operand_string_result(%arg0: tensor) -> tensor { + %0 = "tf_device.cluster"() ( { + // CHECK: "tf.Const"() {value = dense<1> : tensor} + // CHECK-NOT: _xla_outside_compilation + // CHECK: "tf.Identity" + // CHECK-SAME: _xla_outside_compilation + // CHECK-SAME: tf.string + // CHECK: "tf.Identity" + // CHECK-NOT: _xla_outside_compilation + %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %2 = "tf.Identity"(%arg0) : (tensor) -> tensor + %3 = "tf.Identity"(%1) : (tensor) -> tensor + 
tf_device.return %3 : tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + return %0 : tensor +} + +// Test that a tf.IfRegion op with a captured string operand is marked for outside compilation. + +// CHECK-LABEL: func @if_region_captured_string +func @if_region_captured_string(%arg0: tensor, %arg1: tensor) -> tensor { + %0 = "tf_device.cluster"() ( { + // CHECK: "tf.Const"() {value = dense<1> : tensor} + // CHECK-NOT: _xla_outside_compilation + // CHECK: "tf.IfRegion" + // CHECK: "tf.StringToNumber" + // CHECK: _xla_outside_compilation = "auto", is_stateless = true + %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %2 = "tf.IfRegion"(%arg0) ( { + %3 = "tf.StringToNumber"(%arg1) {out_type = f32} : (tensor) -> tensor + "tf.Yield"(%3) : (tensor) -> () + }, { + %4 = "tf.Const"() {value = dense<1.0> : tensor} : () -> tensor + "tf.Yield"(%4) : (tensor) -> () + }) {is_stateless = true} : (tensor) -> (tensor) + %5 = "tf.Identity"(%2) : (tensor) -> tensor + tf_device.return %5 : tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + return %0 : tensor +} + +// Test that ops with string results/operands inside a tf.IfRegion branch are marked for outside compilation. + +// CHECK-LABEL: func @if_region_string_op +func @if_region_string_op(%arg0: tensor, %arg1: tensor) -> tensor { + %0 = "tf_device.cluster"() ( { + // CHECK: "tf.Const"() {value = dense<1> : tensor} + // CHECK-NOT: _xla_outside_compilation + // CHECK: "tf.IfRegion" + // CHECK-NOT: _xla_outside_compilation + %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %2 = "tf.IfRegion"(%arg0) ( { + %3 = "tf.Const"() {value = dense<1.0> : tensor} : () -> tensor + "tf.Yield"(%3) : (tensor) -> () + }, { + // CHECK: "tf.Const"() {_xla_outside_compilation = "auto", value = dense<"1.0"> : tensor} + // CHECK-NEXT: "tf.StringToNumber" + // CHECK-SAME: _xla_outside_compilation + %4 = "tf.Const"() {value = dense<"1.0"> : tensor} : () -> tensor + %5 = "tf.StringToNumber"(%4) {out_type = f32} : (tensor) -> tensor + "tf.Yield"(%5) : (tensor) -> () + // CHECK: {is_stateless + }) {is_stateless = true} : (tensor) -> (tensor) + %6 = "tf.Identity"(%2) : (tensor) -> tensor + tf_device.return %6: tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + return %0 : tensor +} + +// Test that ops with string results/operands inside a nested tf.IfRegion branch are marked for outside compilation. 
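A minimal sketch of the marking itself, assuming only what the CHECK lines in these tests assert (the string element type and SSA name here are illustrative, not taken from the patch): the pass leaves compilable ops untouched and attaches an `_xla_outside_compilation = "auto"` attribute to ops it cannot compile, for example a string constant inside a `tf_device.cluster`:

// Before -tf-mark-ops-for-outside-compilation (inside a tf_device.cluster):
%s = "tf.Const"() {value = dense<"1.0"> : tensor<!tf.string>} : () -> tensor<!tf.string>
// After the pass: same op, now carrying the outside-compilation marker.
%s = "tf.Const"() {_xla_outside_compilation = "auto", value = dense<"1.0"> : tensor<!tf.string>} : () -> tensor<!tf.string>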
+ +// CHECK-LABEL: func @nested_if_region_string_op +func @nested_if_region_string_op(%arg0: tensor, %arg1: tensor) -> tensor { + %0 = "tf_device.cluster"() ( { + // CHECK: "tf.Const"() {value = dense<1> : tensor} + // CHECK-NOT: _xla_outside_compilation + // CHECK: "tf.IfRegion" + // CHECK-NOT: _xla_outside_compilation + %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %2 = "tf.IfRegion"(%arg0) ( { + %3 = "tf.Const"() {value = dense<1.0> : tensor} : () -> tensor + "tf.Yield"(%3) : (tensor) -> () + }, { + // CHECK: "tf.Const"() {value = dense : tensor} + // CHECK-NOT: _xla_outside_compilation + %4 = "tf.Const"() {value = dense : tensor} : () -> tensor + %5 = "tf.IfRegion"(%4)({ + // CHECK: "tf.Const"() {_xla_outside_compilation = "auto", value = dense<"1.0"> : tensor} + // CHECK-NEXT: "tf.StringToNumber" + // CHECK-SAME: _xla_outside_compilation + %6 = "tf.Const"() {value = dense<"1.0"> : tensor} : () -> tensor + %7 = "tf.StringToNumber"(%6) {out_type = f32} : (tensor) -> tensor + "tf.Yield"(%7) : (tensor) -> () + }, { + // CHECK: "tf.Const"() {value = dense<1.000000e+00> : tensor} + // CHECK-NOT: _xla_outside_compilation + %8 = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor + "tf.Yield"(%8) : (tensor) -> () + // CHECK: {is_stateless + }){is_stateless = true} : (tensor) -> (tensor) + "tf.Yield"(%5) : (tensor) -> () + // CHECK: {is_stateless + }) {is_stateless = true} : (tensor) -> (tensor) + %9 = "tf.Identity"(%2) : (tensor) -> tensor + tf_device.return %9: tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + return %0 : tensor +} + +// Test that a tf.WhileRegion op with a captured string operand is marked for outside compilation. + +// CHECK-LABEL: func @while_region_captured_string +func @while_region_captured_string(%arg0: tensor, %arg1: tensor) -> tensor { + %0 = "tf_device.cluster"() ( { + // CHECK: "tf.Const"() {value = dense<1.000000e+00> : tensor} + // CHECK-NOT: _xla_outside_compilation + // CHECK: "tf.WhileRegion" + // CHECK: "tf.StringToNumber" + // CHECK: _xla_outside_compilation = "auto", is_stateless = true + %1 = "tf.Const"() {value = dense<1.0> : tensor} : () -> tensor + %2:2 = "tf.WhileRegion"(%1, %arg0) ( { + ^bb0(%carg0: tensor, %carg1: tensor): + %limit = constant dense<5> : tensor + %cond = "tf.NotEqual"(%carg1, %limit) : (tensor, tensor) -> tensor + "tf.Yield"(%cond) : (tensor) -> () + }, { + ^bb0(%barg0: tensor, %barg1: tensor): + %one = constant dense<1> : tensor + %sub = "tf.Sub"(%barg1, %one) : (tensor, tensor) -> tensor + %3 = "tf.StringToNumber"(%arg1) {out_type = f32} : (tensor) -> tensor + "tf.Yield"(%3, %sub) : (tensor, tensor) -> () + }) {is_stateless = true} : (tensor, tensor) -> (tensor, tensor) + // CHECK: "tf.Identity" + // CHECK-NOT: _xla_outside_compilation + %5 = "tf.Identity"(%2#0) : (tensor) -> (tensor) + tf_device.return %5 : tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + return %0 : tensor +} + +// Test that an unsupported op within a tf.WhileRegion is marked for outside compilation. 
+ +// CHECK-LABEL: func @while_region_unsupported_op +func @while_region_unsupported_op(%arg0: tensor, %arg1: tensor) -> tensor { + %0 = "tf_device.cluster"() ( { + // CHECK: "tf.Const"() {value = dense<1.000000e+00> : tensor} + // CHECK-NOT: _xla_outside_compilation + // CHECK: "tf.WhileRegion" + %1 = "tf.Const"() {value = dense<1.0> : tensor} : () -> tensor + %2:2 = "tf.WhileRegion"(%1, %arg0) ( { + ^bb0(%carg0: tensor, %carg1: tensor): + %limit = constant dense<5> : tensor + %cond = "tf.NotEqual"(%carg1, %limit) : (tensor, tensor) -> tensor + "tf.Yield"(%cond) : (tensor) -> () + }, { + ^bb0(%barg0: tensor, %barg1: tensor): + %one = constant dense<1> : tensor + %sub = "tf.Sub"(%barg1, %one) : (tensor, tensor) -> tensor + // CHECK: "tf.UnsupportedOp" + // CHECK-SAME: _xla_outside_compilation + %3 = "tf.UnsupportedOp"() {value = dense<1> : tensor} : () -> tensor + // CHECK: "tf.Const"() {value = dense<1.000000e+00> : tensor} + %4 = "tf.Const"() {value = dense<1.0> : tensor} : () -> tensor + "tf.Yield"(%4, %sub) : (tensor, tensor) -> () + // CHECK: {is_stateless = true + }) {is_stateless = true} : (tensor, tensor) -> (tensor, tensor) + // CHECK: "tf.Identity" + // CHECK-NOT: _xla_outside_compilation + %5 = "tf.Identity"(%2#0) : (tensor) -> (tensor) + tf_device.return %5 : tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + return %0 : tensor +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/case.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/case.mlir new file mode 100644 index 00000000000..2f2ee6f1286 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/case.mlir @@ -0,0 +1,38 @@ +// RUN: tf-mlir-translate -mlir-to-graphdef %s -o - | FileCheck %s + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 486 : i32}} { + func @main() { + tf_executor.graph { + %outputs, %control = tf_executor.island wraps "tf.Const"() {device = "", value = dense<1> : tensor} : () -> tensor + %outputs_0, %control_1 = tf_executor.island wraps "tf.Const"() {device = "", value = dense<0> : tensor} : () -> tensor + %outputs_2, %control_3 = tf_executor.island wraps "tf.Case"(%outputs_0, %outputs) {Tin = [i32], Tout = [i32], _lower_using_switch_merge = true, _read_only_resource_inputs = [], branches = [@indexed_case_branch0_40, @indexed_case_branch1_50], device = "", is_stateless = true, output_shapes = [#tf.shape<>]} : (tensor, tensor) -> tensor<*xi32> loc("stateless_case") + %outputs_4, %control_5 = tf_executor.island wraps "tf.Identity"(%outputs_2) {device = ""} : (tensor<*xi32>) -> tensor<*xi32> + %outputs_6, %control_7 = tf_executor.island wraps "tf.Case"(%outputs_0, %outputs) {Tin = [i32], Tout = [i32], _lower_using_switch_merge = true, _read_only_resource_inputs = [], branches = [@indexed_case_branch0_40, @indexed_case_branch1_50], device = "", is_stateless = false, output_shapes = [#tf.shape<>]} : (tensor, tensor) -> tensor<*xi32> loc("regular_case") + tf_executor.fetch + } + return + } + + func @indexed_case_branch0_40(%arg0: tensor) -> tensor<*xi32> attributes {sym_visibility = "private"} { + %0 = tf_executor.graph { + %outputs, %control = tf_executor.island wraps "tf.Const"() {device = "", value = dense<1> : tensor} : () -> tensor + %outputs_0, %control_1 = tf_executor.island wraps "tf.AddV2"(%arg0, %outputs) {device = ""} : (tensor, tensor) -> tensor<*xi32> + tf_executor.fetch %outputs_0 : tensor<*xi32> + } + return %0 : tensor<*xi32> + } + + func 
@indexed_case_branch1_50(%arg0: tensor) -> tensor<*xi32> attributes {sym_visibility = "private"} { + %0 = tf_executor.graph { + %outputs, %control = tf_executor.island wraps "tf.Const"() {device = "", value = dense<2> : tensor} : () -> tensor + %outputs_0, %control_1 = tf_executor.island wraps "tf.AddV2"(%arg0, %outputs) {device = ""} : (tensor, tensor) -> tensor<*xi32> + tf_executor.fetch %outputs_0 : tensor<*xi32> + } + return %0 : tensor<*xi32> + } +} + +// CHECK: name: "stateless_case" +// CHECK-NEXT: "StatelessCase" +// CHECK: name: "regular_case" +// CHECK-NEXT: "Case" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/func_attr.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/func_attr.mlir new file mode 100644 index 00000000000..fadb62c44b8 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/func_attr.mlir @@ -0,0 +1,40 @@ +// RUN: tf-mlir-translate -mlir-to-graphdef %s | tf-mlir-translate -graphdef-to-mlir | tf-mlir-translate -mlir-to-graphdef | FileCheck %s + +// Tests #tf.func attributes are exported as AttrValue.NameAttrList attributes +// with its attr field populated with nested attributes. + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 458 : i32}} { + func @main() { + tf_executor.graph { + %control = tf_executor.island wraps "tf.NoOp"() {_f = #tf.func<@callee, {attr2 = true, attr3 = 8.0 : f32}>} : () -> () + tf_executor.fetch + } + return + } + func @callee() { + tf_executor.graph { + tf_executor.fetch + } + return + } +} + +// CHECK: op: "NoOp" +// CHECK-NEXT: attr +// CHECK-NEXT: key: "_f" +// CHECK-NEXT: value +// CHECK-NEXT: func +// CHECK-NEXT: name: [[FUNC_NAME:".*"]] +// CHECK-NEXT: attr +// CHECK-NEXT: key: "attr2" +// CHECK-NEXT: value +// CHECK-NEXT: b: true +// CHECK: attr +// CHECK-NEXT: key: "attr3" +// CHECK-NEXT: value +// CHECK-NEXT: f: 8 + +// CHECK: library +// CHECK-NEXT: function +// CHECK-NEXT: signature +// CHECK-NEXT: name: [[FUNC_NAME]] diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/functional-if-ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/functional-if-ops.mlir index d9ad36f2ce6..b6933459382 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/functional-if-ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/functional-if-ops.mlir @@ -1,13 +1,13 @@ // RUN: tf-mlir-translate -mlir-to-graphdef %s -o - | FileCheck %s -func @main(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) { +func @main(%arg0: tensor, %arg1: tensor, %arg2: tensor<4xf32>, %arg3: tensor<4xf32>) -> (tensor<4xf32>, tensor<4xf32>) { %0:2 = tf_executor.graph { %outputs_2, %control_3 = tf_executor.island wraps "tf.Less"(%arg0, %arg1) : (tensor, tensor) -> tensor - %outputs_4, %control_5 = tf_executor.island wraps "tf.If"(%outputs_2, %arg0, %arg1) {else_branch = @cond_false, is_stateless = false, then_branch = @cond_true} : (tensor, tensor, tensor) -> tensor loc("StatefulIf") - %outputs_6, %control_7 = tf_executor.island wraps "tf.If"(%outputs_2, %arg0, %arg1) {else_branch = @cond_false, is_stateless = true, then_branch = @cond_true} : (tensor, tensor, tensor) -> tensor loc("StatelessIf") - tf_executor.fetch %outputs_4, %outputs_6 : tensor, tensor + %outputs_4, %control_5 = tf_executor.island wraps "tf.If"(%outputs_2, %arg2, %arg3) {else_branch = @cond_false, is_stateless = false, then_branch = @cond_true} : (tensor, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> loc("StatefulIf") + %outputs_6, %control_7 = 
tf_executor.island wraps "tf.If"(%outputs_2, %arg2, %arg3) {else_branch = @cond_false, is_stateless = true, then_branch = @cond_true} : (tensor, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> loc("StatelessIf") + tf_executor.fetch %outputs_4, %outputs_6 : tensor<4xf32>, tensor<4xf32> } - return %0#0, %0#1 : tensor, tensor + return %0#0, %0#1 : tensor<4xf32>, tensor<4xf32> } func @cond_true(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> { @@ -34,8 +34,32 @@ func @cond_false(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> { // CHECK-NOT: name: // CHECK: op: "If" // CHECK-NOT: is_stateless +// CHECK: attr { +// CHECK: key: "output_shapes" +// CHECK: value { +// CHECK: list { +// CHECK: shape { +// CHECK: dim { +// CHECK: size: 4 +// CHECK: } +// CHECK: } +// CHECK: } +// CHECK: } +// CHECK: } // CHECK: name: "StatelessIf" // CHECK-NOT: name: // CHECK: op: "StatelessIf" // CHECK-NOT: is_stateless +// CHECK: attr { +// CHECK: key: "output_shapes" +// CHECK: value { +// CHECK: list { +// CHECK: shape { +// CHECK: dim { +// CHECK: size: 4 +// CHECK: } +// CHECK: } +// CHECK: } +// CHECK: } +// CHECK: } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/functional-while-ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/functional-while-ops.mlir index 9f14a144d9d..c7a4630d985 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/functional-while-ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/functional-while-ops.mlir @@ -1,12 +1,12 @@ // RUN: tf-mlir-translate -mlir-to-graphdef %s -o - | FileCheck %s -func @main(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) { +func @main(%arg0: tensor, %arg1: tensor<5xf32>) -> (tensor<5xf32>, tensor<5xf32>) { %0:2 = tf_executor.graph { - %outputs_2:2, %control_3 = tf_executor.island wraps "tf.While"(%arg0, %arg1) {body = @body, cond = @cond, is_stateless = false} : (tensor, tensor) -> (tensor, tensor) loc("StatefulWhile") - %outputs_4:2, %control_5 = tf_executor.island wraps "tf.While"(%arg0, %arg1) {body = @body, cond = @cond, is_stateless = true} : (tensor, tensor) -> (tensor, tensor) loc("StatelessWhile") - tf_executor.fetch %outputs_2#1, %outputs_4#1 : tensor, tensor + %outputs_2:2, %control_3 = tf_executor.island wraps "tf.While"(%arg0, %arg1) {body = @body, cond = @cond, is_stateless = false} : (tensor, tensor<5xf32>) -> (tensor, tensor<5xf32>) loc("StatefulWhile") + %outputs_4:2, %control_5 = tf_executor.island wraps "tf.While"(%arg0, %arg1) {body = @body, cond = @cond, is_stateless = true} : (tensor, tensor<5xf32>) -> (tensor, tensor<5xf32>) loc("StatelessWhile") + tf_executor.fetch %outputs_2#1, %outputs_4#1 : tensor<5xf32>, tensor<5xf32> } - return %0#0, %0#1 : tensor, tensor + return %0#0, %0#1 : tensor<5xf32>, tensor<5xf32> } func @cond(%arg0: tensor<*xi32>, %arg1: tensor<*xf32>) -> tensor { @@ -36,8 +36,34 @@ func @body(%arg0: tensor<*xi32>, %arg1: tensor<*xf32>) -> (tensor<*xi32>, tensor // CHECK-NOT: name: // CHECK: op: "While" // CHECK-NOT: is_stateless +// CHECK: attr { +// CHECK: key: "output_shapes" +// CHECK: value { +// CHECK: list { +// CHECK: shape { +// CHECK: dim { +// CHECK: size: 5 +// CHECK: } +// CHECK: } +// CHECK: } +// CHECK: } +// CHECK: } + // CHECK: name: "StatelessWhile" // CHECK-NOT: name: // CHECK: op: "StatelessWhile" // CHECK-NOT: is_stateless +// CHECK: attr { +// CHECK: key: "output_shapes" +// CHECK: value { +// CHECK: list { +// CHECK: shape { +// CHECK: dim { +// CHECK: size: 5 +// CHECK: } +// CHECK: } +// CHECK: } 
+// CHECK: } +// CHECK: } + diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/tf-legacy-call.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/tf-legacy-call.mlir index 5f92d789066..3e50aa18098 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/tf-legacy-call.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/tf-legacy-call.mlir @@ -3,7 +3,7 @@ func @main() { tf_executor.graph { %outputs, %control = tf_executor.island wraps "tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "Constant", value = dense<0> : tensor} : () -> tensor - %outputs_0, %control_1 = tf_executor.island wraps "tf.LegacyCall"(%outputs) {f = @foo0} : (tensor) -> tensor + %outputs_0, %control_1 = tf_executor.island wraps "tf.LegacyCall"(%outputs) {_tpu_replicate = "cluster", device = "", f = @foo0} : (tensor) -> tensor tf_executor.fetch } return @@ -23,6 +23,12 @@ func @foo0(%arg0: tensor<*xi32>) -> tensor<*xi32> { // CHECK-NEXT: value { // CHECK-NEXT: list { // CHECK-NEXT: shape { +// CHECK: attr { +// CHECK-NEXT: key: "_tpu_replicate" +// CHECK-NEXT: value { +// CHECK-NEXT: s: "cluster" +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK: library { // CHECK-NEXT: function { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/parallel_execute_to_islands.mlir b/tensorflow/compiler/mlir/tensorflow/tests/parallel_execute_to_islands.mlir index 31ca7b28fe7..52dc06cd393 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/parallel_execute_to_islands.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/parallel_execute_to_islands.mlir @@ -1,4 +1,4 @@ -// RUN: tf-opt %s -tf-parallel-execute-to-islands | FileCheck %s +// RUN: tf-opt %s -tf-parallel-execute-to-islands | FILECHECK_OPTS="" FileCheck %s // CHECK-LABEL: func @check_regions_to_islands func @check_regions_to_islands() { @@ -17,11 +17,9 @@ func @check_regions_to_islands() { return } -// CHECK: %[[ISLAND_INPUT_CTL:[a-z_0-9]*]] = tf_executor.island { -// CHECK-NEXT: tf_executor.yield -// CHECK: %[[ISLAND_1_CTL:[a-z_0-9]*]] = tf_executor.island(%[[ISLAND_INPUT_CTL]]) { +// CHECK: %[[ISLAND_1_CTL:[a-z_0-9]*]] = tf_executor.island { // CHECK: tf_executor.yield -// CHECK: %[[ISLAND_2_CTL:[a-z_0-9]*]] = tf_executor.island(%[[ISLAND_INPUT_CTL]]) { +// CHECK: %[[ISLAND_2_CTL:[a-z_0-9]*]] = tf_executor.island { // CHECK: tf_executor.yield // CHECK: %{{.*}} = tf_executor.island(%[[ISLAND_1_CTL]], %[[ISLAND_2_CTL]]) { // CHECK-NEXT: tf_executor.yield @@ -192,3 +190,37 @@ func @check_output_barrier_correctly_forwards_outputs(%arg0 : tensor) -> ten // CHECK: tf_executor.yield %[[OP_C_OUTPUT]] : tensor // CHECK: %[[OUTPUT_SINK_OUTPUT:[a-z_0-9]*]]:2, %[[OUTPUT_SINK_CTL:[a-z_0-9]*]] = tf_executor.island { // CHECK-NEXT: tf_executor.yield %[[ISLAND_1_OUTPUT]], %[[ISLAND_2_OUTPUT]] : tensor, tensor + +// CHECK-LABEL: func @check_parallel_execute_using_args +// CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) +func @check_parallel_execute_using_args(%arg0 : tensor) { + tf_executor.graph { + %1:2 = tf_executor.island { + %2 = "tf.opA"(%arg0) : (tensor) -> tensor + tf_executor.yield %2 : tensor + } + %2:2 = tf_executor.island { + %3 = "tf.opB"(%arg0) : (tensor) -> tensor + tf_executor.yield %3 : tensor + } + tf_executor.island() { + "tf_device.parallel_execute"() ({ + %4 = "tf.opC"(%arg0, %1#0) : (tensor, tensor) -> tensor + tf_device.return %4 : tensor + }, + { + %5 = "tf.opD"(%arg0, %2#0) : (tensor, tensor) -> tensor + tf_device.return %5 : tensor + }) {} : () -> (tensor, tensor) + tf_executor.yield + } + tf_executor.fetch + 
} + return +} + +// Verify that args are directly accessed in newly created island without alias +// through entry barrier. + +// CHECK: "tf.opC"(%[[ARG_0]] +// CHECK: "tf.opD"(%[[ARG_0]] diff --git a/tensorflow/compiler/mlir/tensorflow/tests/parallelize_embedding_params_ops_pass.mlir b/tensorflow/compiler/mlir/tensorflow/tests/parallelize_embedding_params_ops_pass.mlir new file mode 100644 index 00000000000..e1cfaba5dcc --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/parallelize_embedding_params_ops_pass.mlir @@ -0,0 +1,96 @@ +// RUN: tf-opt %s -tf-parallize-embedding-params-ops -verify-diagnostics -split-input-file | FileCheck %s + +// CHECK-LABEL: func @two_shards +func @two_shards(%arg0: tensor<*x!tf.resource>>, %arg1: tensor<*x!tf.resource>>, %arg2: tensor<*x!tf.resource>>, %arg3: tensor<*x!tf.resource>>) { + tf_executor.graph { + %control = tf_executor.island { + // CHECK: "tf_device.parallel_execute" + // CHECK: "tf.ReadVariableOp" + // CHECK: "tf.ReadVariableOp" + // CHECK: "tf.LoadTPUEmbeddingAdagradParameters" + // CHECK: tf_device.return + // CHECK: "tf.ReadVariableOp" + // CHECK: "tf.ReadVariableOp" + // CHECK: "tf.LoadTPUEmbeddingAdagradParameters" + // CHECK: tf_device.return + %0 = "tf.ReadVariableOp"(%arg0) {device = ""} : (tensor<*x!tf.resource>>) -> tensor<8xf32> + %1 = "tf.ReadVariableOp"(%arg1) {device = ""} : (tensor<*x!tf.resource>>) -> tensor<8xf32> + %2 = "tf.ReadVariableOp"(%arg2) {device = ""} : (tensor<*x!tf.resource>>) -> tensor<8xf32> + %3 = "tf.ReadVariableOp"(%arg3) {device = ""} : (tensor<*x!tf.resource>>) -> tensor<8xf32> + "tf.LoadTPUEmbeddingAdagradParameters"(%0, %1) {config = "", device = "/job:worker/replica:0/task:0/device:CPU:0", num_shards = 2 : i64, shard_id = 0 : i64, table_id = -1 : i64, table_name = "param_table"} : (tensor<8xf32>, tensor<8xf32>) -> () + "tf.LoadTPUEmbeddingAdagradParameters"(%2, %3) {config = "", device = "/job:worker/replica:0/task:1/device:CPU:0", num_shards = 2 : i64, shard_id = 1 : i64, table_id = -1 : i64, table_name = "param_table"} : (tensor<8xf32>, tensor<8xf32>) -> () + tf_executor.yield + } + tf_executor.fetch %control : !tf_executor.control + } + return +} + +// Verifies that resource reads shared across two shards are kept outside the +// parallel_execute op. 
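A rough sketch of the shape those CHECK lines describe, assuming resource element types of `tensor<8xf32>` as in the surrounding tests (the SSA names are illustrative): the shared reads are hoisted ahead of the generated `tf_device.parallel_execute`, and each shard's load lands in its own region:

%r0 = "tf.ReadVariableOp"(%arg0) {device = ""} : (tensor<*x!tf.resource<tensor<8xf32>>>) -> tensor<8xf32>
%r1 = "tf.ReadVariableOp"(%arg1) {device = ""} : (tensor<*x!tf.resource<tensor<8xf32>>>) -> tensor<8xf32>
"tf_device.parallel_execute"() ({
  // shard 0
  "tf.LoadTPUEmbeddingAdagradParameters"(%r0, %r1) {config = "", device = "/job:worker/replica:0/task:0/device:CPU:0", num_shards = 2 : i64, shard_id = 0 : i64, table_id = -1 : i64, table_name = "param_table"} : (tensor<8xf32>, tensor<8xf32>) -> ()
  tf_device.return
}, {
  // shard 1
  "tf.LoadTPUEmbeddingAdagradParameters"(%r0, %r1) {config = "", device = "/job:worker/replica:0/task:1/device:CPU:0", num_shards = 2 : i64, shard_id = 1 : i64, table_id = -1 : i64, table_name = "param_table"} : (tensor<8xf32>, tensor<8xf32>) -> ()
  tf_device.return
}) {} : () -> ()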
+ +// CHECK-LABEL: func @shared_reads +func @shared_reads(%arg0: tensor<*x!tf.resource>>, %arg1: tensor<*x!tf.resource>>) { + tf_executor.graph { + %control = tf_executor.island { + // CHECK: "tf.ReadVariableOp" + %0 = "tf.ReadVariableOp"(%arg0) {device = ""} : (tensor<*x!tf.resource>>) -> tensor<8xf32> + // CHECK: "tf.ReadVariableOp" + %1 = "tf.ReadVariableOp"(%arg1) {device = ""} : (tensor<*x!tf.resource>>) -> tensor<8xf32> + + // CHECK: "tf_device.parallel_execute" + // CHECK: "tf.LoadTPUEmbeddingAdagradParameters" + // CHECK: tf_device.return + // CHECK: "tf.LoadTPUEmbeddingAdagradParameters" + // CHECK: tf_device.return + "tf.LoadTPUEmbeddingAdagradParameters"(%0, %1) {config = "", device = "/job:worker/replica:0/task:0/device:CPU:0", num_shards = 2 : i64, shard_id = 0 : i64, table_id = -1 : i64, table_name = "param_table"} : (tensor<8xf32>, tensor<8xf32>) -> () + "tf.LoadTPUEmbeddingAdagradParameters"(%0, %1) {config = "", device = "/job:worker/replica:0/task:1/device:CPU:0", num_shards = 2 : i64, shard_id = 1 : i64, table_id = -1 : i64, table_name = "param_table"} : (tensor<8xf32>, tensor<8xf32>) -> () + tf_executor.yield + } + tf_executor.fetch %control : !tf_executor.control + } + return +} + +// Verifies that if the resource variables are used in ops other than read +// variable op whose semantics are not known then the function is kept +// unchanged. + +// CHECK-LABEL: func @update_var +func @update_var(%arg0: tensor<*x!tf.resource>>, %arg1: tensor<*x!tf.resource>>, %arg2: tensor<*x!tf.resource>>) { + tf_executor.graph { + // CHECK-NOT: tf_device.parallel_execute + %control = tf_executor.island { + %0 = "tf.ReadVariableOp"(%arg0) {device = ""} : (tensor<*x!tf.resource>>) -> tensor<8xf32> + %1 = "tf.ReadVariableOp"(%arg1) {device = ""} : (tensor<*x!tf.resource>>) -> tensor<8xf32> + "tf.LoadTPUEmbeddingAdagradParameters"(%0, %1) {config = "", device = "/job:worker/replica:0/task:0/device:CPU:0", num_shards = 2 : i64, shard_id = 0 : i64, table_id = -1 : i64, table_name = "param_table"} : (tensor<8xf32>, tensor<8xf32>) -> () + + %2 = "tf.ReadVariableOp"(%arg2) {device = ""} : (tensor<*x!tf.resource>>) -> tensor<8xf32> + %zeros = "tf.Const"() {value = dense<1.0> : tensor<8xf32>} : () -> tensor<8xf32> + "tf.AssignVariableOp"(%arg2, %zeros) : (tensor<*x!tf.resource>>, tensor<8xf32>) -> () + %3 = "tf.ReadVariableOp"(%arg2) {device = ""} : (tensor<*x!tf.resource>>) -> tensor<8xf32> + "tf.LoadTPUEmbeddingAdagradParameters"(%2, %3) {config = "", device = "/job:worker/replica:0/task:1/device:CPU:0", num_shards = 2 : i64, shard_id = 1 : i64, table_id = -1 : i64, table_name = "param_table"} : (tensor<8xf32>, tensor<8xf32>) -> () + tf_executor.yield + } + tf_executor.fetch %control : !tf_executor.control + } + return +} + +// ----- + +func @invalid_shard_range(%arg0: tensor<*x!tf.resource>>, %arg1: tensor<*x!tf.resource>>) { + tf_executor.graph { + %control = tf_executor.island { + // expected-error @-1 {{require continuous range of shards}} + %0 = "tf.ReadVariableOp"(%arg0) {device = ""} : (tensor<*x!tf.resource>>) -> tensor<8xf32> + %1 = "tf.ReadVariableOp"(%arg1) {device = ""} : (tensor<*x!tf.resource>>) -> tensor<8xf32> + + "tf.LoadTPUEmbeddingAdagradParameters"(%0, %1) {config = "", device = "/job:worker/replica:0/task:0/device:CPU:0", num_shards = 3 : i64, shard_id = 0 : i64, table_id = -1 : i64, table_name = "param_table"} : (tensor<8xf32>, tensor<8xf32>) -> () + "tf.LoadTPUEmbeddingAdagradParameters"(%0, %1) {config = "", device = "/job:worker/replica:0/task:1/device:CPU:0", num_shards 
= 3 : i64, shard_id = 3 : i64, table_id = -1 : i64, table_name = "param_table"} : (tensor<8xf32>, tensor<8xf32>) -> () + tf_executor.yield + } + tf_executor.fetch %control : !tf_executor.control + } + return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir b/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir index 40cfc03b8e6..3e6d4f37bac 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir @@ -1,4 +1,4 @@ -// RUN: tf-opt %s -split-input-file -verify-diagnostics -tf-promote-resources-to-args | FileCheck %s +// RUN: tf-opt %s -split-input-file -verify-diagnostics -tf-promote-resources-to-args | FILECHECK_OPTS="" FileCheck %s // One resource, one read. The initial value of the resource is read. // CHECK-LABEL: func @main(%arg0: tensor, %arg1: tensor {tf.resource_name = "x"}) -> tensor<2xf32> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/readonly_references_to_resources.mlir b/tensorflow/compiler/mlir/tensorflow/tests/readonly_references_to_resources.mlir index 2b8f47a407e..7d36e6f4319 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/readonly_references_to_resources.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/readonly_references_to_resources.mlir @@ -12,6 +12,18 @@ func @f() { // ----- +// Test case: Basic converting. '_class' attribute is at IdentityOp. + +func @f() { + // CHECK: "tf.VarHandleOp" + // CHECK: "tf.ReadVariableOp" + %val0 = "tf.VariableV2"() {container = "", device = "", shape = #tf.shape<96>, shared_name = ""} : () -> tensor<96x!tf.f32ref> + %val1 = "tf.Identity"(%val0) {_class = ["loc:@v"]} : (tensor<96x!tf.f32ref>) -> tensor<96xf32> + return +} + +// ----- + // Test case: Two ReadVariable ops. 
func @f() { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/region-control-flow-to-functional.mlir b/tensorflow/compiler/mlir/tensorflow/tests/region-control-flow-to-functional.mlir index 5ea863852ad..e9d4e441a10 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/region-control-flow-to-functional.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/region-control-flow-to-functional.mlir @@ -1,19 +1,23 @@ -// RUN: tf-opt %s -tf-region-control-flow-to-functional -split-input-file -//| FileCheck %s --dump-input=fail +// RUN: tf-opt %s -tf-region-control-flow-to-functional -split-input-file | FileCheck %s +// Simple IfRegion // CHECK: func @tf.IfRegion_else(%arg0: tensor<*xf32>) -> tensor<*xf32> // CHECK-NEXT: "tf.Neg" // CHECK: func @tf.IfRegion_then(%arg0: tensor<*xf32>) -> tensor<*xf32> // CHECK-NEXT: "tf.Abs" func @testSimple(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<*xf32> { - // CHECK: "tf.If"{{.+}}else_branch = @tf.IfRegion_else{{.+}}then_branch = @tf.IfRegion_then + // CHECK: "tf.If" + // CHECK-SAME: _attr0 = false + // CHECK-NOT: attr1 + // CHECK-SAME: else_branch = @tf.IfRegion_else + // CHECK-SAME: then_branch = @tf.IfRegion_then %0 = "tf.IfRegion"(%arg0) ({ %1 = "tf.Abs"(%arg1) : (tensor<*xf32>) -> tensor<*xf32> "tf.Yield"(%1) : (tensor<*xf32>) -> () }, { %2 = "tf.Neg"(%arg1) : (tensor<*xf32>) -> tensor<*xf32> "tf.Yield"(%2) : (tensor<*xf32>) -> () - }) { is_stateless = true } : (tensor) -> tensor<*xf32> + }) {is_stateless = true, _attr0 = false, attr1 = "hello"} : (tensor) -> tensor<*xf32> return %0 : tensor<*xf32> } @@ -42,7 +46,7 @@ func @testIfCondition(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> // ----- -// Constant sinking +// Constant sinking for IfRegion // CHECK: func @tf.IfRegion_else() -> tensor<2xf32> // CHECK-NEXT: constant dense<1.0 @@ -105,7 +109,7 @@ func @testNested(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<*xf32> { // ----- -// Match existing function->Region pattern (simple) +// Match existing function->Region pattern (simple) for IfRegion func @testIf1Then(tensor<*xf32>) -> tensor<*xf32> func @testIf1Else(tensor<*xf32>) -> tensor<*xf32> func @testIf1Result(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<*xf32> { @@ -122,7 +126,7 @@ func @testIf1Result(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<*xf32> { // ----- -// Match existing function->Region pattern (with casts) +// Match existing function->Region pattern (with casts) for IfRegion func @testIf1Then(tensor<*xf32>) -> tensor<*xf32> func @testIf1Else(tensor<*xf32>) -> tensor<*xf32> @@ -142,7 +146,29 @@ func @testIf2Result(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { // ----- -// No inputs, some outputs +// Match existing function->Region pattern (with multiple casts) for IfRegion + +func @testIf1Then(tensor<*xf32>) -> tensor<*xf32> +func @testIf1Else(tensor<*xf32>) -> tensor<*xf32> +func @testIf2Result(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + // CHECK: "tf.If"({{.+}}) {else_branch = @testIf1Else, {{.+}} then_branch = @testIf1Then} + %0 = "tf.IfRegion"(%arg0) ( { + %1 = "tf.Cast"(%arg1) {Truncate = false} : (tensor<2xf32>) -> tensor + %2 = "tf.Cast"(%1) {Truncate = false} : (tensor) -> tensor<*xf32> + %3 = call @testIf1Then(%2) : (tensor<*xf32>) -> tensor<*xf32> + "tf.Yield"(%3) : (tensor<*xf32>) -> () + }, { + %1 = "tf.Cast"(%arg1) {Truncate = false} : (tensor<2xf32>) -> tensor + %2 = "tf.Cast"(%1) {Truncate = false} : (tensor) -> tensor<*xf32> + %3 = call @testIf1Else(%2) : (tensor<*xf32>) -> tensor<*xf32> + "tf.Yield"(%3) : (tensor<*xf32>) -> () 
+ }) {is_stateless = false} : (tensor) -> tensor<2xf32> + return %0 : tensor<2xf32> +} + +// ----- + +// No inputs, some outputs for IfRegion // CHECK: func @tf.IfRegion_else() -> tensor<2xf32> // CHECK-NEXT: constant dense<1.000000e+00> // CHECK-NEXT: "tf.Neg" @@ -165,7 +191,7 @@ func @testSimple(%arg0: tensor) -> tensor<2xf32> { // ----- -// No outputs, some inputs +// No outputs, some inputs for IfRegion // // CHECK: func @tf.IfRegion_else(%arg0: tensor<*xf32>) // CHECK-NEXT: "tf.Neg" @@ -186,3 +212,383 @@ func @testNoOutputs(%arg0: tensor, %arg1: tensor<*xf32>) -> () { return } +// ----- + +// Simple WhileRegion +// CHECK: func @tf.WhileRegion_body{{.+}}{sym_visibility = "private"} +// CHECK: "tf.Add" +// CHECK: constant dense<1> +// CHECK: "tf.Sub" +// CHECK:func @tf.WhileRegion_cond{{.+}}{sym_visibility = "private"} +// CHECK: constant dense<0> +// CHECK: "tf.NotEqual" +// CHECK-LABEL: testValidWhileRegion +func @testValidWhileRegion(%arg0 : tensor<*xf32>, %arg1 : tensor) -> tensor<*xf32> { + // CHECK: [[Result:%.*]]:2 = "tf.While"(%arg0, %arg1) + // CHECK-SAME: _attr0 = false + // CHECK-NOT: attr1 + // CHECK-SAME: body = @tf.WhileRegion_body + // CHECK-SAME: cond = @tf.WhileRegion_cond + %0:2 = "tf.WhileRegion"(%arg0, %arg1) ( + { + // condition, check if count has reached 0 + ^bb0(%carg0: tensor<*xf32>, %carg1: tensor): + %zero = constant dense<0> : tensor + %ne = "tf.NotEqual"(%carg1, %zero) : (tensor, tensor) -> tensor + "tf.Yield"(%ne) : (tensor) -> () + }, + { + // loop body + ^bb0(%barg0: tensor<*xf32>, %barg1: tensor): + %add = "tf.Add"(%barg0, %barg0) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + %one = constant dense<1> : tensor + %sub = "tf.Sub"(%barg1, %one) : (tensor, tensor) -> tensor + "tf.Yield"(%add, %sub) : (tensor<*xf32>, tensor) -> () + } + ) { is_stateless = false, _attr0 = false, attr1 = "hello"} : (tensor<*xf32>, tensor) -> (tensor<*xf32>, tensor) + // CHECK: return [[Result]]#0 + return %0#0 : tensor<*xf32> +} + +// ----- + +// WhileRegion with type mismatch +// CHECK: func @tf.WhileRegion_body{{.+}}{sym_visibility = "private"} +// CHECK: "tf.Add" +// CHECK: constant dense<1> +// CHECK: "tf.Sub" +// CHECK:func @tf.WhileRegion_cond{{.+}}{sym_visibility = "private"} +// CHECK: constant dense<0> +// CHECK: "tf.NotEqual" +// CHECK-LABEL: testWhileRegionTypeMismatch +func @testWhileRegionTypeMismatch(%arg0 : tensor<*xf32>, %arg1 : tensor) -> tensor<*xf32> { + // CHECK: [[Result:%.*]]:2 = "tf.While"(%arg0, %arg1) {body = @tf.WhileRegion_body, cond = @tf.WhileRegion_cond + %0:2 = "tf.WhileRegion"(%arg0, %arg1) ( + { + // condition, check if count has reached 0 + ^bb0(%carg0: tensor<4xf32>, %carg1: tensor): + %zero = constant dense<0> : tensor + %ne = "tf.NotEqual"(%carg1, %zero) : (tensor, tensor) -> tensor + "tf.Yield"(%ne) : (tensor) -> () + }, + { + // loop body + ^bb0(%barg0: tensor<4xf32>, %barg1: tensor): + %add = "tf.Add"(%barg0, %barg0) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + %one = constant dense<1> : tensor + %sub = "tf.Sub"(%barg1, %one) : (tensor, tensor) -> tensor + "tf.Yield"(%add, %sub) : (tensor<4xf32>, tensor) -> () + } + ) { is_stateless = false } : (tensor<*xf32>, tensor) -> (tensor<*xf32>, tensor) + // CHECK: return [[Result]]#0 + return %0#0 : tensor<*xf32> +} + +// ----- + +// WhileRegion with constant sinking +// CHECK: func @tf.WhileRegion_body{{.+}}{sym_visibility = "private"} +// CHECK: constant dense<1> +// CHECK: "tf.Add" +// CHECK: "tf.Sub" +// CHECK:func @tf.WhileRegion_cond{{.+}}{sym_visibility = "private"} +// CHECK: 
constant dense<0> +// CHECK: "tf.NotEqual" +// CHECK-LABEL: testWhileRegionConstantSink +func @testWhileRegionConstantSink(%arg0 : tensor<*xf32>, %arg1 : tensor) -> tensor<*xf32> { + %zero = constant dense<0> : tensor + %one = constant dense<1> : tensor + // CHECK: [[Result:%.*]]:2 = "tf.While"(%arg0, %arg1) {body = @tf.WhileRegion_body, cond = @tf.WhileRegion_cond + %0:2 = "tf.WhileRegion"(%arg0, %arg1) ( + { + ^bb0(%carg0: tensor<4xf32>, %carg1: tensor): + %ne = "tf.NotEqual"(%carg1, %zero) : (tensor, tensor) -> tensor + "tf.Yield"(%ne) : (tensor) -> () + }, + { + // loop body + ^bb0(%barg0: tensor<4xf32>, %barg1: tensor): + %add = "tf.Add"(%barg0, %barg0) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + %sub = "tf.Sub"(%barg1, %one) : (tensor, tensor) -> tensor + "tf.Yield"(%add, %sub) : (tensor<4xf32>, tensor) -> () + } + ) { is_stateless = false } : (tensor<*xf32>, tensor) -> (tensor<*xf32>, tensor) + // CHECK: return [[Result]]#0 + return %0#0 : tensor<*xf32> +} + +// ----- + +// WhileRegion with implicitly captured extern value in cond +// CHECK: func @tf.WhileRegion_body(%arg0: tensor<*xf32>, %arg1: tensor, %arg2: tensor) +// CHECK: "tf.Add" +// CHECK: constant dense<1> +// CHECK: "tf.Sub" +// CHECK: return %{{.+}}, %{{.+}}, %arg2 : tensor<*xf32>, tensor, tensor +// CHECK: func @tf.WhileRegion_cond(%arg0: tensor<*xf32>, %arg1: tensor, %arg2: tensor) +// CHECK: "tf.NotEqual"(%arg1, %arg2) +// CHECK-LABEL: testWhileRegionExternInCond +func @testWhileRegionExternInCond(%arg0 : tensor<*xf32>, %arg1 : tensor, %arg2 : tensor) -> tensor<*xf32> { + %cst = constant dense<4> : tensor + %limit = "tf.Add"(%arg2, %cst) : (tensor, tensor) -> tensor + // CHECK: [[Result:%.*]]:3 = "tf.While"(%arg0, %arg1, %{{.+}}) {body = @tf.WhileRegion_body, cond = @tf.WhileRegion_cond + %0:2 = "tf.WhileRegion"(%arg0, %arg1) ( + { + ^bb0(%carg0: tensor<*xf32>, %carg1: tensor): + %ne = "tf.NotEqual"(%carg1, %limit) : (tensor, tensor) -> tensor + "tf.Yield"(%ne) : (tensor) -> () + }, + { + // loop body + ^bb0(%barg0: tensor<*xf32>, %barg1: tensor): + %add = "tf.Add"(%barg0, %barg0) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + %one = constant dense<1> : tensor + %sub = "tf.Sub"(%barg1, %one) : (tensor, tensor) -> tensor + "tf.Yield"(%add, %sub) : (tensor<*xf32>, tensor) -> () + } + ) { is_stateless = false } : (tensor<*xf32>, tensor) -> (tensor<*xf32>, tensor) + // CHECK: return [[Result]]#0 + return %0#0 : tensor<*xf32> +} + +// ----- + +// WhileRegion with implicitly captured extern value in body +// CHECK: func @tf.WhileRegion_body(%arg0: tensor<*xf32>, %arg1: tensor, %arg2: tensor) +// CHECK: %0 = "tf.Add"(%arg0, %arg0) +// CHECK: %1 = "tf.Sub"(%arg1, %arg2) +// CHECK: return %0, %1, %arg2 + +// CHECK: func @tf.WhileRegion_cond(%arg0: tensor<*xf32>, %arg1: tensor, %arg2: tensor) +// CHECK: constant dense<0> +// CHECK: "tf.NotEqual" + +// CHECK-LABEL: testWhileRegionExternInBody +func @testWhileRegionExternInBody(%arg0 : tensor<*xf32>, %arg1 : tensor, %arg2 : tensor) -> tensor<*xf32> { + %zero = constant dense<0> : tensor + %cst = constant dense<4> : tensor + %stride = "tf.Add"(%arg2, %cst) : (tensor, tensor) -> tensor + // CHECK: [[Result:%.*]]:3 = "tf.While"(%arg0, %arg1, %{{.+}}) {body = @tf.WhileRegion_body, cond = @tf.WhileRegion_cond + %0:2 = "tf.WhileRegion"(%arg0, %arg1) ( + { + ^bb0(%carg0: tensor<*xf32>, %carg1: tensor): + %ne = "tf.NotEqual"(%carg1, %zero) : (tensor, tensor) -> tensor + "tf.Yield"(%ne) : (tensor) -> () + }, + { + // loop body + ^bb0(%barg0: tensor<*xf32>, %barg1: tensor): + 
%add = "tf.Add"(%barg0, %barg0) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + %sub = "tf.Sub"(%barg1, %stride) : (tensor, tensor) -> tensor + "tf.Yield"(%add, %sub) : (tensor<*xf32>, tensor) -> () + } + ) { is_stateless = false } : (tensor<*xf32>, tensor) -> (tensor<*xf32>, tensor) + // CHECK: return [[Result]]#0 + return %0#0 : tensor<*xf32> +} + +// ----- + +// WhileRegion with implicitly captured extern value in cond and body +// CHECK: func @tf.WhileRegion_body(%arg0: tensor<*xf32>, %arg1: tensor, %arg2: tensor, %arg3: tensor) +// CHECK: return %{{.+}}, %{{.+}}, %arg2, %arg3 +// CHECK: func @tf.WhileRegion_cond(%arg0: tensor<*xf32>, %arg1: tensor, %arg2: tensor, %arg3: tensor) +// CHECK-LABEL: testWhileRegionExternInBodyAndCond +func @testWhileRegionExternInBodyAndCond(%arg0 : tensor<*xf32>, %arg1 : tensor, %arg2 : tensor) -> tensor<*xf32> { + %cst = constant dense<4> : tensor + %stride = "tf.Add"(%arg2, %cst) : (tensor, tensor) -> tensor + %cst1 = constant dense<44> : tensor + %limit = "tf.Add"(%arg2, %cst1) : (tensor, tensor) -> tensor + // CHECK: [[Result:%.*]]:4 = "tf.While"(%arg0, %arg1, %{{.+}}, %{{.+}}) {body = @tf.WhileRegion_body, cond = @tf.WhileRegion_cond + %0:2 = "tf.WhileRegion"(%arg0, %arg1) ( + { + ^bb0(%carg0: tensor<*xf32>, %carg1: tensor): + %ne = "tf.NotEqual"(%carg1, %limit) : (tensor, tensor) -> tensor + "tf.Yield"(%ne) : (tensor) -> () + }, + { + // loop body + ^bb0(%barg0: tensor<*xf32>, %barg1: tensor): + %add = "tf.Add"(%barg0, %barg0) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + %sub = "tf.Sub"(%barg1, %stride) : (tensor, tensor) -> tensor + "tf.Yield"(%add, %sub) : (tensor<*xf32>, tensor) -> () + } + ) { is_stateless = false } : (tensor<*xf32>, tensor) -> (tensor<*xf32>, tensor) + // CHECK: return [[Result]]#0 + return %0#0 : tensor<*xf32> +} + +// ----- + +// WhileRegion with same value implicitly captured in cond and body +// CHECK: func @tf.WhileRegion_body(%arg0: tensor<*xf32>, %arg1: tensor, %arg2: tensor) +// CHECK: return %{{.+}}, %{{.+}}, %arg2 +// CHECK: func @tf.WhileRegion_cond(%arg0: tensor<*xf32>, %arg1: tensor, %arg2: tensor) +// CHECK-LABEL: testWhileRegionSameExternInBodyAndCond +func @testWhileRegionSameExternInBodyAndCond(%arg0 : tensor<*xf32>, %arg1 : tensor, %arg2 : tensor) -> tensor<*xf32> { + %cst = constant dense<4> : tensor + %stride = "tf.Add"(%arg2, %cst) : (tensor, tensor) -> tensor + // CHECK: [[Result:%.*]]:3 = "tf.While"(%arg0, %arg1, %{{.+}}) {body = @tf.WhileRegion_body, cond = @tf.WhileRegion_cond + %0:2 = "tf.WhileRegion"(%arg0, %arg1) ( + { + ^bb0(%carg0: tensor<*xf32>, %carg1: tensor): + %ne = "tf.NotEqual"(%carg1, %stride) : (tensor, tensor) -> tensor + "tf.Yield"(%ne) : (tensor) -> () + }, + { + // loop body + ^bb0(%barg0: tensor<*xf32>, %barg1: tensor): + %add = "tf.Add"(%barg0, %barg0) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + %sub = "tf.Sub"(%barg1, %stride) : (tensor, tensor) -> tensor + "tf.Yield"(%add, %sub) : (tensor<*xf32>, tensor) -> () + } + ) { is_stateless = false } : (tensor<*xf32>, tensor) -> (tensor<*xf32>, tensor) + // CHECK: return [[Result]]#0 + return %0#0 : tensor<*xf32> +} + +// ----- + +// Simple trivially transformable while +// CHECK: func @while_cond +// CHECK: func @while_body +// CHECK-LABEL: testWhileRegionTrivial +func @while_cond(%arg0 : tensor<*xf32>, %arg1 : tensor) -> tensor +func @while_body(%arg0 : tensor<*xf32>, %arg1 : tensor) -> (tensor<*xf32>, tensor) +func @testWhileRegionTrivial(%arg0 : tensor<*xf32>, %arg1 : tensor) -> tensor<*xf32> { + // CHECK: 
[[Result:%.*]]:2 = "tf.While"(%arg0, %arg1) {body = @while_body, cond = @while_cond + %0:2 = "tf.WhileRegion"(%arg0, %arg1) ( + { + ^bb0(%carg0: tensor<*xf32>, %carg1: tensor): + %cond = call @while_cond(%carg0, %carg1) : (tensor<*xf32>, tensor) -> tensor + "tf.Yield"(%cond) : (tensor) -> () + }, + { + // loop body + ^bb0(%barg0: tensor<*xf32>, %barg1: tensor): + %bdy:2 = call @while_body(%barg0, %barg1) : (tensor<*xf32>, tensor) -> (tensor<*xf32>, tensor) + "tf.Yield"(%bdy#0, %bdy#1) : (tensor<*xf32>, tensor) -> () + } + ) { is_stateless = false } : (tensor<*xf32>, tensor) -> (tensor<*xf32>, tensor) + // CHECK: return [[Result]]#0 + return %0#0 : tensor<*xf32> +} + +// ----- + +// Trivially transformable with casts +// CHECK: func @while_cond +// CHECK: func @while_body +// CHECK-LABEL: testWhileRegionTrivialCasts +func @while_cond(%arg0 : tensor<4xf32>, %arg1 : tensor) -> tensor +func @while_body(%arg0 : tensor<4xf32>, %arg1 : tensor) -> (tensor<4xf32>, tensor) +func @testWhileRegionTrivialCasts(%arg0 : tensor<*xf32>, %arg1 : tensor) -> tensor<*xf32> { + // CHECK: [[Result:%.*]]:2 = "tf.While"(%arg0, %arg1) {body = @while_body, cond = @while_cond + %0:2 = "tf.WhileRegion"(%arg0, %arg1) ( + { + ^bb0(%carg0: tensor<*xf32>, %carg1: tensor): + %cond_cast = "tf.Cast"(%carg0) : (tensor<*xf32>) -> tensor<4xf32> + %cond = call @while_cond(%cond_cast, %carg1) : (tensor<4xf32>, tensor) -> tensor + "tf.Yield"(%cond) : (tensor) -> () + }, + { + // loop body + ^bb0(%barg0: tensor<*xf32>, %barg1: tensor): + %bdy_cast = "tf.Cast"(%barg0) : (tensor<*xf32>) -> tensor<4xf32> + %bdy:2 = call @while_body(%bdy_cast, %barg1) : (tensor<4xf32>, tensor) -> (tensor<4xf32>, tensor) + "tf.Yield"(%bdy#0, %bdy#1) : (tensor<4xf32>, tensor) -> () + } + ) { is_stateless = false } : (tensor<*xf32>, tensor) -> (tensor<*xf32>, tensor) + // CHECK: return [[Result]]#0 + return %0#0 : tensor<*xf32> +} + +// ----- + +// Trivially transformable with multiple casts +// CHECK: func @while_cond +// CHECK: func @while_body +// CHECK-LABEL: testWhileRegionTrivialMultipleCasts +func @while_cond(%arg0 : tensor<4xf32>, %arg1 : tensor) -> tensor +func @while_body(%arg0 : tensor<4xf32>, %arg1 : tensor) -> (tensor<4xf32>, tensor) +func @testWhileRegionTrivialMultipleCasts(%arg0 : tensor<*xf32>, %arg1 : tensor) -> tensor<*xf32> { + // CHECK: [[Result:%.*]]:2 = "tf.While"(%arg0, %arg1) {body = @while_body, cond = @while_cond + %0:2 = "tf.WhileRegion"(%arg0, %arg1) ( + { + ^bb0(%carg0: tensor<*xf32>, %carg1: tensor): + %cond_cast0 = "tf.Cast"(%carg0) : (tensor<*xf32>) -> tensor + %cond_cast1 = "tf.Cast"(%cond_cast0) : (tensor) -> tensor<4xf32> + %cond = call @while_cond(%cond_cast1, %carg1) : (tensor<4xf32>, tensor) -> tensor + "tf.Yield"(%cond) : (tensor) -> () + }, + { + // loop body + ^bb0(%barg0: tensor<*xf32>, %barg1: tensor): + %bdy_cast0 = "tf.Cast"(%barg0) : (tensor<*xf32>) -> tensor + %bdy_cast1 = "tf.Cast"(%bdy_cast0) : (tensor) -> tensor<4xf32> + %bdy:2 = call @while_body(%bdy_cast1, %barg1) : (tensor<4xf32>, tensor) -> (tensor<4xf32>, tensor) + "tf.Yield"(%bdy#0, %bdy#1) : (tensor<4xf32>, tensor) -> () + } + ) { is_stateless = false } : (tensor<*xf32>, tensor) -> (tensor<*xf32>, tensor) + // CHECK: return [[Result]]#0 + return %0#0 : tensor<*xf32> +} + +// ----- + +// Almost trivially transformable with extern values +// CHECK: func @tf.WhileRegion_body +// CHECK: call @while_body +// CHECK: @tf.WhileRegion_cond +// CHECK: call @while_cond +// CHECK-LABEL: testWhileRegionExtern +func @while_cond(%arg0 : tensor<*xf32>, %arg1 : 
tensor) -> tensor +func @while_body(%arg0 : tensor<*xf32>, %arg1 : tensor, %arg2 : tensor<*xf32>) -> (tensor<*xf32>, tensor) +func @testWhileRegionExtern(%arg0 : tensor<*xf32>, %arg1 : tensor) -> tensor<*xf32> { + %ext = "tf.Neg"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> + // CHECK: [[Result:%.*]]:3 = "tf.While"(%arg0, %arg1, %{{.+}}) {body = @tf.WhileRegion_body, cond = @tf.WhileRegion_cond + %0:2 = "tf.WhileRegion"(%arg0, %arg1) ( + { + ^bb0(%carg0: tensor<*xf32>, %carg1: tensor): + %cond = call @while_cond(%carg0, %carg1) : (tensor<*xf32>, tensor) -> tensor + "tf.Yield"(%cond) : (tensor) -> () + }, + { + // loop body + ^bb0(%barg0: tensor<*xf32>, %barg1: tensor): + %bdy:2 = call @while_body(%barg0, %barg1, %ext) : (tensor<*xf32>, tensor, tensor<*xf32>) -> (tensor<*xf32>, tensor) + "tf.Yield"(%bdy#0, %bdy#1) : (tensor<*xf32>, tensor) -> () + } + ) { is_stateless = false } : (tensor<*xf32>, tensor) -> (tensor<*xf32>, tensor) + // CHECK: return [[Result]]#0 + return %0#0 : tensor<*xf32> +} + +// ----- + +// Almost trivially transformable, mismatching block arguments +// CHECK: func @tf.WhileRegion_body +// CHECK: call @while_body +// CHECK: @tf.WhileRegion_cond +// CHECK: call @while_cond +// CHECK-LABEL: testWhileRegionBlockArgMismatch +func @while_cond(%arg0 : tensor, %arg1 : tensor<*xf32>) -> tensor +func @while_body(%arg0 : tensor<*xf32>, %arg1 : tensor) -> (tensor<*xf32>, tensor) +func @testWhileRegionBlockArgMismatch(%arg0 : tensor<*xf32>, %arg1 : tensor) -> tensor<*xf32> { + // CHECK: [[Result:%.*]]:2 = "tf.While"(%arg0, %arg1) {body = @tf.WhileRegion_body, cond = @tf.WhileRegion_cond + %0:2 = "tf.WhileRegion"(%arg0, %arg1) ( + { + ^bb0(%carg0: tensor<*xf32>, %carg1: tensor): + %cond = call @while_cond(%carg1, %carg0) : (tensor, tensor<*xf32>) -> tensor + "tf.Yield"(%cond) : (tensor) -> () + }, + { + // loop body + ^bb0(%barg0: tensor<*xf32>, %barg1: tensor): + %bdy:2 = call @while_body(%barg0, %barg1) : (tensor<*xf32>, tensor) -> (tensor<*xf32>, tensor) + "tf.Yield"(%bdy#0, %bdy#1) : (tensor<*xf32>, tensor) -> () + } + ) { is_stateless = false } : (tensor<*xf32>, tensor) -> (tensor<*xf32>, tensor) + // CHECK: return [[Result]]#0 + return %0#0 : tensor<*xf32> +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island.mlir b/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island.mlir index 9931a45f995..487234ce958 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island.mlir @@ -1,4 +1,4 @@ -// RUN: tf-opt %s -tf-replicate-to-island | FileCheck %s +// RUN: tf-opt -split-input-file %s -tf-replicate-to-island | FileCheck %s // Tests per replica island has same control operands as island holding // replicate. @@ -223,3 +223,219 @@ func @replica_id_attr_added(%arg0: tensor, %arg1: tensor // CHECK: "tf.A" // CHECK-NOT: _xla_replica_id // CHECK: tf_executor.fetch + + +// Tests device ordinals are added to `tf._XlaSendFromHost`/`tf._XlaRecvAtHost` +// based on the first TPU core device id. 
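A worked example of the expectation that follows, as a gloss rather than part of the patch (the `tensor<f32>` result type is illustrative): replica 0 of this test is placed on "/job:worker/replica:0/task:0/device:TPU:1", so its cloned island rewrites the placeholder ordinal to 1, while replica 1 maps to TPU:2 and gets ordinal 2:

// Input op inside tf_device.replicate (device_ordinal is a placeholder):
%0 = "tf._XlaRecvAtHost"(%arg1) {_xla_has_host_transfer = true, device_ordinal = 0 : i64, key = "host_compute_channel_send_0"} : (tensor<2x!tf.string>) -> tensor<f32>
// Replica 0 copy after -tf-replicate-to-island (TPU:1 -> ordinal 1):
%0 = "tf._XlaRecvAtHost"(%arg1) {_xla_has_host_transfer = true, device_ordinal = 1 : i64, key = "host_compute_channel_send_0"} : (tensor<2x!tf.string>) -> tensor<f32>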
+// CHECK-LABEL: func @device_ordinals +func @device_ordinals(%arg0: tensor, %arg1: tensor<2x!tf.string>) { + tf_executor.graph { + tf_executor.island { + tf_device.replicate([%arg0, %arg0] as %arg2: tensor) {n = 2 : i32, devices = {TPU_REPLICATED_CORE_0 = ["/job:worker/replica:0/task:0/device:TPU:1", "/job:worker/replica:0/task:0/device:TPU:2"]}} { + %0 = "tf._XlaRecvAtHost"(%arg1) {_xla_has_host_transfer = true, device_ordinal = 0 : i64, key = "host_compute_channel_send_0"} : (tensor<2x!tf.string>) -> tensor + "tf._XlaSendFromHost"(%0, %arg1) {_xla_has_host_transfer = true, device_ordinal = 0 : i64, key = "host_compute_channel_recv_0"} : (tensor, tensor<2x!tf.string>) -> () + "tf.NoOp"() : () -> () + tf_device.return + } + tf_executor.yield + } + tf_executor.fetch + } + return +} + +// CHECK: tf_executor.island +// CHECK: "tf._XlaRecvAtHost" +// CHECK-SAME: device_ordinal = 1 +// CHECK: "tf._XlaSendFromHost" +// CHECK-SAME: device_ordinal = 1 +// CHECK: "tf.NoOp" +// CHECK: tf_executor.island +// CHECK: "tf._XlaRecvAtHost" +// CHECK-SAME: device_ordinal = 2 +// CHECK: "tf._XlaSendFromHost" +// CHECK-SAME: device_ordinal = 2 +// CHECK: "tf.NoOp" + +// ----- + +// Tests functions with replica variant ops reachable from a replicate region +// is cloned and remapped. + +// CHECK-LABEL: func @call_with_replicate_variant_ops +func @call_with_replicate_variant_ops(%arg0: tensor, %arg1: tensor<2x!tf.string>) { + tf_executor.graph { + tf_executor.island { + tf_device.replicate([%arg0, %arg0] as %arg2: tensor) {n = 2 : i32, devices = {TPU_REPLICATED_CORE_0 = ["/job:worker/replica:0/task:0/device:TPU:1", "/job:worker/replica:0/task:0/device:TPU:2"]}} { + "tf.StatefulPartitionedCall"(%arg1) {config = "", config_proto = "", executor_type = "", f = @send_recv} : (tensor<2x!tf.string>) -> () + tf_device.return + } + tf_executor.yield + } + tf_executor.fetch + } + return +} + +// CHECK: "tf.StatefulPartitionedCall" +// CHECK-SAME: f = [[CALL_REPLICA_0:@[a-z0-9_]+]] +// CHECK: "tf.StatefulPartitionedCall" +// CHECK-SAME: f = [[CALL_REPLICA_1:@[a-z0-9_]+]] + +func @send_recv(%arg0: tensor<2x!tf.string>) { + %0 = "tf._XlaRecvAtHost"(%arg0) {_xla_has_host_transfer = true, device_ordinal = 0 : i64, key = "host_compute_channel_send_0"} : (tensor<2x!tf.string>) -> tensor + "tf._XlaSendFromHost"(%0, %arg0) {_xla_has_host_transfer = true, device_ordinal = 0 : i64, key = "host_compute_channel_recv_0"} : (tensor, tensor<2x!tf.string>) -> () + "tf.NoOp"() : () -> () + return +} + +// CHECK: func [[CALL_REPLICA_0]] +// CHECK: "tf._XlaRecvAtHost" +// CHECK-SAME: device_ordinal = 1 +// CHECK: "tf._XlaSendFromHost" +// CHECK-SAME: device_ordinal = 1 + +// CHECK: func [[CALL_REPLICA_1]] +// CHECK: "tf._XlaRecvAtHost" +// CHECK-SAME: device_ordinal = 2 +// CHECK: "tf._XlaSendFromHost" +// CHECK-SAME: device_ordinal = 2 + +// ----- + +// Tests transitive functions with replica variant ops reachable from a +// replicate region is cloned and remapped. 
+ +// CHECK-LABEL: func @call_with_replicate_variant_ops +func @call_with_replicate_variant_ops(%arg0: tensor, %arg1: tensor<2x!tf.string>) { + tf_executor.graph { + tf_executor.island { + tf_device.replicate([%arg0, %arg0] as %arg2: tensor) {n = 2 : i32, devices = {TPU_REPLICATED_CORE_0 = ["/job:worker/replica:0/task:0/device:TPU:1", "/job:worker/replica:0/task:0/device:TPU:2"]}} { + "tf.StatefulPartitionedCall"(%arg1) {config = "", config_proto = "", executor_type = "", f = @callee} : (tensor<2x!tf.string>) -> () + tf_device.return + } + tf_executor.yield + } + tf_executor.fetch + } + return +} + +// CHECK: "tf.StatefulPartitionedCall" +// CHECK-SAME: f = [[CALLEE_REPLICA_0:@[a-z0-9_]+]] +// CHECK: "tf.StatefulPartitionedCall" +// CHECK-SAME: f = [[CALLEE_REPLICA_1:@[a-z0-9_]+]] + +func @callee(%arg0: tensor<2x!tf.string>) { + "tf.StatefulPartitionedCall"(%arg0) {config = "", config_proto = "", executor_type = "", f = @send_recv} : (tensor<2x!tf.string>) -> () + return +} + +func @send_recv(%arg0: tensor<2x!tf.string>) { + %0 = "tf._XlaRecvAtHost"(%arg0) {_xla_has_host_transfer = true, device_ordinal = 0 : i64, key = "host_compute_channel_send_0"} : (tensor<2x!tf.string>) -> tensor + "tf._XlaSendFromHost"(%0, %arg0) {_xla_has_host_transfer = true, device_ordinal = 0 : i64, key = "host_compute_channel_recv_0"} : (tensor, tensor<2x!tf.string>) -> () + "tf.NoOp"() : () -> () + return +} + +// CHECK: func [[CALLEE_REPLICA_0]] +// CHECK: "tf.StatefulPartitionedCall" +// CHECK-SAME: f = [[TRANSITIVE_CALLEE_REPLICA_0:@[a-z0-9_]+]] + +// CHECK: func [[TRANSITIVE_CALLEE_REPLICA_0]] +// CHECK: "tf._XlaRecvAtHost" +// CHECK-SAME: device_ordinal = 1 +// CHECK: "tf._XlaSendFromHost" +// CHECK-SAME: device_ordinal = 1 + +// CHECK: func [[CALLEE_REPLICA_1]] +// CHECK: "tf.StatefulPartitionedCall" +// CHECK-SAME: f = [[TRANSITIVE_CALLEE_REPLICA_1:@[a-z0-9_]+]] + +// CHECK: func [[TRANSITIVE_CALLEE_REPLICA_1]] +// CHECK: "tf._XlaRecvAtHost" +// CHECK-SAME: device_ordinal = 2 +// CHECK: "tf._XlaSendFromHost" +// CHECK-SAME: device_ordinal = 2 + +// ----- + +// Tests functional control flow functions with replica variant ops reachable +// from a replicate region is cloned and remapped. Only the branches reachable +// with replica variant ops are cloned. 
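Put differently (an editorial gloss of the CHECK lines below; the clone symbol name and the element types are hypothetical, since the pass generates its own names and the exact types are elided here): `@cond_true` contains `tf._XlaSendFromHost`/`tf._XlaRecvAtHost`, so each replica's `tf.If` is retargeted at a per-replica clone of that branch with rewritten device ordinals, while `@cond_false`, which has no replica-variant ops, stays shared:

// Replica 0 (sketch):
%0 = "tf.If"(%arg4, %arg5, %arg6, %arg3) {else_branch = @cond_false, is_stateless = true, then_branch = @cond_true_replica_0} : (tensor<i1>, tensor<f32>, tensor<f32>, tensor<2x!tf.string>) -> tensor<f32>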
+ +// CHECK-LABEL: func @control_flow_with_replicate_variant_ops +func @control_flow_with_replicate_variant_ops(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor<2x!tf.string>) { + tf_executor.graph { + tf_executor.island { + tf_device.replicate([%arg0, %arg0] as %arg4: tensor, [%arg1, %arg1] as %arg5: tensor, [%arg2, %arg2] as %arg6: tensor) {n = 2 : i32, devices = {TPU_REPLICATED_CORE_0 = ["/job:worker/replica:0/task:0/device:TPU:1", "/job:worker/replica:0/task:0/device:TPU:2"]}} { + %0 = "tf.If"(%arg4, %arg5, %arg6, %arg3) {else_branch = @cond_false, is_stateless = true, then_branch = @cond_true} : (tensor, tensor, tensor, tensor<2x!tf.string>) -> tensor + tf_device.return + } + tf_executor.yield + } + tf_executor.fetch + } + return +} + +// CHECK: "tf.If" +// CHECK-SAME: else_branch = @cond_false +// CHECK-SAME: then_branch = [[COND_TRUE_REPLICA_0:@[a-z0-9_]+]] +// CHECK: "tf.If" +// CHECK-SAME: else_branch = @cond_false +// CHECK-SAME: then_branch = [[COND_TRUE_REPLICA_1:@[a-z0-9_]+]] + +func @cond_false(%arg0: tensor, %arg1: tensor, %arg2: tensor<2x!tf.string>) -> tensor { + return %arg0 : tensor +} + +// CHECK-NOT: func @cond_false.+( + +func @cond_true(%arg0: tensor, %arg1: tensor, %arg2: tensor<2x!tf.string>) -> tensor { + "tf._XlaSendFromHost"(%arg1, %arg2) {_xla_has_host_transfer = true, device_ordinal = 0 : i64, key = "host_compute_channel_recv_0"} : (tensor, tensor<2x!tf.string>) -> () + %0 = "tf._XlaRecvAtHost"(%arg2) {_xla_has_host_transfer = true, device_ordinal = 0 : i64, key = "host_compute_channel_send_0"} : (tensor<2x!tf.string>) -> tensor + return %0 : tensor +} + +// CHECK: func [[COND_TRUE_REPLICA_0]] +// CHECK: "tf._XlaSendFromHost" +// CHECK-SAME: device_ordinal = 1 +// CHECK: "tf._XlaRecvAtHost" +// CHECK-SAME: device_ordinal = 1 + +// CHECK: func [[COND_TRUE_REPLICA_1]] +// CHECK: "tf._XlaSendFromHost" +// CHECK-SAME: device_ordinal = 2 +// CHECK: "tf._XlaRecvAtHost" +// CHECK-SAME: device_ordinal = 2 + +// ----- + +// Tests function with no replica variant ops reachable from a replicate region +// is not cloned. + +// CHECK-LABEL: func @no_replicate_variant_ops +func @no_replicate_variant_ops(%arg0: tensor, %arg1: tensor<2x!tf.string>) { + tf_executor.graph { + tf_executor.island { + tf_device.replicate([%arg0, %arg0] as %arg2: tensor) {n = 2 : i32, devices = {TPU_REPLICATED_CORE_0 = ["/job:worker/replica:0/task:0/device:TPU:1", "/job:worker/replica:0/task:0/device:TPU:2"]}} { + "tf.StatefulPartitionedCall"(%arg1) {config = "", config_proto = "", executor_type = "", f = @send_recv} : (tensor<2x!tf.string>) -> () + tf_device.return + } + tf_executor.yield + } + tf_executor.fetch + } + return +} + +// CHECK: "tf.StatefulPartitionedCall" +// CHECK-SAME: f = @send_recv + +func @send_recv(%arg0: tensor<2x!tf.string>) { + "tf.NoOp"() : () -> () + return +} + +// CHECK-NOT: @send_recv.+( diff --git a/tensorflow/compiler/mlir/tensorflow/tests/resource-alias-analysis-test.mlir b/tensorflow/compiler/mlir/tensorflow/tests/resource-alias-analysis-test.mlir new file mode 100644 index 00000000000..87da399b726 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/resource-alias-analysis-test.mlir @@ -0,0 +1,234 @@ +// RUN: tf-opt -split-input-file -tf-test-resource-alias-analysis -verify-diagnostics %s | FileCheck %s + +// Test 2 resources that do not alias. 
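+// Each resource handle below gets its own ID, and the reported alias set for +// each handle is a singleton.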
+ +!tf_res = type tensor<*x!tf.resource>> +// CHECK-LABEL: func @non_aliasing_reads_writes +// expected-remark@below {{Region #0, Arg #0, ID 1 : 1}} +// expected-remark@below {{Region #0, Arg #1, ID 2 : 2}} +func @non_aliasing_reads_writes( + %arg0: !tf_res, + %arg1: !tf_res, + %arg2: tensor<32xf32>) -> (tensor<32xf32>) { + %graph = tf_executor.graph { + // CHECK: tf_executor.island + %island:2 = tf_executor.island { + %read0 = "tf.ReadVariableOp"(%arg0) : (!tf_res) -> tensor<32xf32> + "tf.AssignVariableOp"(%arg0, %arg2) : (!tf_res, tensor<32xf32>) -> () + %read1 = "tf.ReadVariableOp"(%arg1) : (!tf_res) -> tensor<32xf32> + // expected-remark@below {{Result #0, ID 0 : 0}} + %var_handle = "tf.VarHandleOp"() {container = "c", shared_name = "v0"} : () -> !tf_res + %read2 = "tf.ReadVariableOp"(%var_handle) : (!tf_res) -> tensor<32xf32> + "tf.AssignVariableOp"(%arg1, %read0) : (!tf_res, tensor<32xf32>) -> () + "tf.AssignVariableOp"(%arg0, %read2) : (!tf_res, tensor<32xf32>) -> () + %read3 = "tf.ReadVariableOp"(%arg0) : (!tf_res) -> tensor<32xf32> + tf_executor.yield %read3 : tensor<32xf32> + } + tf_executor.fetch %island#0 : tensor<32xf32> + } + return %graph : tensor<32xf32> +} + +// ----- +// Tests aliasing of the two resource handles that refer to the same variable. + +!tf_res = type tensor<*x!tf.resource>> +// CHECK-LABEL: func @aliasing_reads_writes +func @aliasing_reads_writes(%arg0: tensor<32xf32>) -> () { + tf_executor.graph { + // CHECK: tf_executor.island + %island = tf_executor.island { + // expected-remark@below {{Result #0, ID 0 : 0, 1, 2}} + %vh0 = "tf.VarHandleOp"() {container = "c", shared_name = "v0"} : () -> !tf_res + // expected-remark@below {{Result #0, ID 1 : 0, 1, 2}} + %vh1 = "tf.VarHandleOp"() {container = "c", shared_name = "v0"} : () -> !tf_res + // expected-remark@below {{Result #0, ID 2 : 0, 1, 2}} + %vh1_id:2 = "tf.IdentityN"(%vh1, %arg0) : (!tf_res, tensor<32xf32>) -> (!tf_res, tensor<32xf32>) + %read0 = "tf.ReadVariableOp"(%vh0) : (!tf_res) -> tensor<32xf32> + "tf.AssignVariableOp"(%vh1_id#0, %arg0) : (!tf_res, tensor<32xf32>) -> () + %read1 = "tf.ReadVariableOp"(%vh0) : (!tf_res) -> tensor<32xf32> + %read2 = "tf.ReadVariableOp"(%vh1) : (!tf_res) -> tensor<32xf32> + "tf.AssignVariableOp"(%vh0, %read2) : (!tf_res, tensor<32xf32>) -> () + "tf.AssignVariableOp"(%vh1_id#0, %read1) : (!tf_res, tensor<32xf32>) -> () + tf_executor.yield + } + tf_executor.fetch %island : !tf_executor.control + } + return +} + +// ----- +// Test an unknown op that has a resource result is marked unknown + +!tf_res = type tensor<*x!tf.resource>> +// CHECK-LABEL: func @unknown_resource_op +func @unknown_resource_op(%arg0: tensor<32xf32>) -> () { + // expected-remark@below {{Result #0, ID 0 : Unknown}} + %0 = "tf.UnknownVarHandleOp"() : () -> !tf_res +} + +// ----- +// Test aliasing through IfOp + +!tf_res = type tensor<*x!tf.resource>> + +// CHECK-LABEL: func @if_op_aliasing +// expected-remark@below {{Region #0, Arg #0, ID 4 : 1, 4}} +// expected-remark@below {{Region #0, Arg #1, ID 5 : 1, 2, 3, 5}} +func @if_op_aliasing(%arg0: !tf_res, %arg1: !tf_res) { + // expected-remark@below {{Result #0, ID 0 : 0}} + %vh0 = "tf.VarHandleOp"() {container = "c", shared_name = "v0"} : () -> !tf_res + %read0 = "tf.ReadVariableOp"(%vh0) : (!tf_res) -> tensor<32xf32> + // expected-remark@below {{Result #0, ID 1 : Unknown}} + // expected-remark@below {{Result #1, ID 2 : 1, 2, 3, 5}} + // expected-remark@below {{Result #2, ID 3 : 0, 1, 2, 3, 5}} + %if:3 = "tf.If"(%read0, %arg1, %vh0) { + then_branch = 
@if_then, else_branch = @if_else, is_stateless = true + } : (tensor<32xf32>, !tf_res, !tf_res) -> (!tf_res, !tf_res, !tf_res) + return +} + +// expected-remark@below {{Region #0, Arg #0, ID 2 : 0, 1, 2}} +// expected-remark@below {{Region #0, Arg #1, ID 3 : 0, 3}} +func @if_then(%arg0: !tf_res, %arg1: !tf_res) -> (!tf_res, !tf_res, !tf_res) { + // expected-remark@below {{Result #0, ID 0 : Unknown}} + %u0 = "tf._UnknownSideEffectingOp_"() : () -> !tf_res + // expected-remark@below {{Result #0, ID 1 : 0, 1, 2}} + %id0 = "tf.Identity"(%arg0) : (!tf_res) -> !tf_res + return %u0, %id0, %id0 : !tf_res, !tf_res, !tf_res +} + +// expected-remark@below {{Region #0, Arg #0, ID 1 : 0, 1}} +// expected-remark@below {{Region #0, Arg #1, ID 2 : 2}} +func @if_else(%arg0: !tf_res, %arg1: !tf_res) -> (!tf_res, !tf_res, !tf_res) { + // expected-remark@below {{Result #0, ID 0 : 0, 1}} + %id0 = "tf.Identity"(%arg0) : (!tf_res) -> !tf_res + return %id0, %id0, %arg1 : !tf_res, !tf_res, !tf_res +} + +// ----- +// Test aliasing through WhileOp +!tf_res = type tensor<*x!tf.resource>> + +// CHECK-LABEL: func @while_op_aliasing +// expected-remark@below {{Region #0, Arg #0, ID 4 : 1, 4}} +// expected-remark@below {{Region #0, Arg #1, ID 5 : 1, 2, 3, 5}} +// expected-remark@below {{Region #0, Arg #2, ID 6 : 1, 2, 3, 6}} +func @while_op_aliasing(%arg0: !tf_res, %arg1: !tf_res, %arg2: !tf_res) { + // expected-remark@below {{Result #0, ID 0 : 0}} + %vh0 = "tf.VarHandleOp"() {container = "c", shared_name = "v0"} : () -> !tf_res + // expected-remark@below {{Result #0, ID 1 : Unknown}} + // expected-remark@below {{Result #1, ID 2 : 1, 2, 3, 5, 6}} + // expected-remark@below {{Result #2, ID 3 : 1, 2, 3, 5, 6}} + %w:3 = "tf.While"(%arg0, %arg1, %arg2) { + body = @while_body, cond = @while_cond, is_stateless = false + } : (!tf_res, !tf_res, !tf_res) -> (!tf_res, !tf_res, !tf_res) + return +} + +// CHECK-LABEL: func @while_body +// Return 0 : new unknown resource +// Return 1 : arg2 +// Return 2 : arg1 +// expected-remark@below {{Region #0, Arg #0, ID 1 : 0, 1}} +// expected-remark@below {{Region #0, Arg #1, ID 2 : 0, 2}} +// expected-remark@below {{Region #0, Arg #2, ID 3 : 0, 3}} +func @while_body(%arg0: !tf_res, %arg1: !tf_res, %arg2: !tf_res) -> (!tf_res, !tf_res, !tf_res) { + // expected-remark@below {{Result #0, ID 0 : Unknown}} + %u0 = "tf._UnknownSideEffectingOp_"() : () -> !tf_res + return %u0, %arg2, %arg1 : !tf_res, !tf_res, !tf_res +} + +// CHECK-LABEL: func @while_cond +// expected-remark@below {{Region #0, Arg #0, ID 0 : 0}} +// expected-remark@below {{Region #0, Arg #1, ID 1 : 1}} +// expected-remark@below {{Region #0, Arg #2, ID 2 : 2}} +func @while_cond(%arg0: !tf_res, %arg1: !tf_res, %arg2: !tf_res) -> tensor { + %0 = constant dense : tensor + return %0 : tensor +} + +// ----- +// Test alias propagation through calls. 
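+// The call result that simply forwards the argument aliases the caller's +// handles, while the handle created inside the callee is reported as Unknown +// at the call site.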
+!tf_res = type tensor<*x!tf.resource>> +// CHECK-LABEL: func @aliasing_through_calls +func @aliasing_through_calls(%arg0: tensor<32xf32>) -> () { + // expected-remark@below {{Result #0, ID 0 : 0, 1, 2, 3}} + %vh0 = "tf.VarHandleOp"() {container = "c", shared_name = "v0"} : () -> !tf_res + // expected-remark@below {{Result #0, ID 1 : 0, 1, 2, 3}} + %vh1 = "tf.Identity"(%vh0) : (!tf_res) -> (!tf_res) + // expected-remark@below {{Result #0, ID 2 : Unknown}} + // expected-remark@below {{Result #1, ID 3 : 0, 1, 2, 3}} + %c:2 = call @passthru(%vh1) : (!tf_res) -> (!tf_res, !tf_res) + return +} + +// expected-remark@below {{Region #0, Arg #0, ID 1 : 1}} +func @passthru(%arg0: !tf_res) -> (!tf_res, !tf_res) { + // expected-remark@below {{Result #0, ID 0 : 0}} + %vx = "tf.VarHandleOp"() {container = "cf", shared_name = "vx"} : () -> !tf_res + return %vx, %arg0 : !tf_res, !tf_res +} + +// ----- +// Test aliasing through IfRegion + +!tf_res = type tensor<*x!tf.resource>> + +// CHECK-LABEL: func @if_region_aliasing +// expected-remark@below {{Region #0, Arg #0, ID 7 : 1, 4, 6, 7}} +// expected-remark@below {{Region #0, Arg #1, ID 8 : 1, 2, 4, 5, 6, 8}} +func @if_region_aliasing(%arg0: !tf_res, %arg1: !tf_res) { + // expected-remark@below {{Result #0, ID 0 : 0, 1, 3, 4, 5}} + %vh0 = "tf.VarHandleOp"() {container = "c", shared_name = "v0"} : () -> !tf_res + %read0 = "tf.ReadVariableOp"(%vh0) : (!tf_res) -> tensor<32xf32> + // expected-remark@below {{Result #0, ID 4 : Unknown}} + // expected-remark@below {{Result #1, ID 5 : 0, 1, 2, 3, 4, 5, 6, 8}} + // expected-remark@below {{Result #2, ID 6 : 1, 2, 4, 5, 6, 7, 8}} + %if:3 = "tf.IfRegion"(%read0) ({ + // expected-remark@below {{Result #0, ID 1 : Unknown}} + %u0 = "tf._UnknownSideEffectingOp_"() : () -> !tf_res + // expected-remark@below {{Result #0, ID 2 : 1, 2, 4, 5, 6, 8}} + %id0 = "tf.Identity"(%arg1) : (!tf_res) -> !tf_res + "tf.Yield"(%u0, %id0, %id0) : (!tf_res, !tf_res, !tf_res) -> () + }, { + // expected-remark@below {{Result #0, ID 3 : 0, 1, 3, 4, 5}} + %id0 = "tf.Identity"(%vh0) : (!tf_res) -> !tf_res + "tf.Yield"(%id0, %id0, %arg0) : (!tf_res, !tf_res, !tf_res) -> () + }) {is_stateless = true} : (tensor<32xf32>) -> (!tf_res, !tf_res, !tf_res) + return +} + +// ----- +// Test aliasing through WhileRegion +!tf_res = type tensor<*x!tf.resource>> + +// CHECK-LABEL: func @while_region_aliasing +// expected-remark@below {{Region #0, Arg #0, ID 11 : 1, 8, 11}} +// expected-remark@below {{Region #0, Arg #1, ID 12 : 1, 8, 9, 10, 12}} +// expected-remark@below {{Region #0, Arg #2, ID 13 : 1, 8, 9, 10, 13}} +func @while_region_aliasing(%arg0: !tf_res, %arg1: !tf_res, %arg2: !tf_res) { + // expected-remark@below {{Result #0, ID 0 : 0, 1, 8}} + %vh0 = "tf.VarHandleOp"() {container = "c", shared_name = "v0"} : () -> !tf_res + // expected-remark@below {{Result #0, ID 8 : Unknown}} + // expected-remark@below {{Result #1, ID 9 : 1, 8, 9, 10, 12, 13}} + // expected-remark@below {{Result #2, ID 10 : 1, 8, 9, 10, 12, 13}} + // expected-remark@below {{Region #0, Arg #0, ID 2 : 1, 2, 8}} + // expected-remark@below {{Region #0, Arg #1, ID 3 : 1, 3, 8}} + // expected-remark@below {{Region #0, Arg #2, ID 4 : 1, 4, 8}} + // expected-remark@below {{Region #1, Arg #0, ID 5 : 1, 5, 8}} + // expected-remark@below {{Region #1, Arg #1, ID 6 : 1, 6, 8}} + // expected-remark@below {{Region #1, Arg #2, ID 7 : 1, 7, 8}} + %w:3 = "tf.WhileRegion"(%arg0, %arg1, %arg2) ({ + ^bb0(%carg0: !tf_res, %carg1: !tf_res, %carg2: !tf_res): + %0 = constant dense : tensor + "tf.Yield"(%0) : 
(tensor) -> () + },{ + ^bb0(%barg0: !tf_res, %barg1: !tf_res, %barg2: !tf_res): + // expected-remark@below {{Result #0, ID 1 : Unknown}} + %u0 = "tf._UnknownSideEffectingOp_"() : () -> !tf_res + "tf.Yield"(%u0, %barg2, %barg1) : (!tf_res, !tf_res, !tf_res) -> () + }) {is_stateless = false} : (!tf_res, !tf_res, !tf_res) -> (!tf_res, !tf_res, !tf_res) + return +} + diff --git a/tensorflow/compiler/mlir/tensorflow/tests/resource-device-inference.mlir b/tensorflow/compiler/mlir/tensorflow/tests/resource-device-inference.mlir index a9e814c647e..a4a7c1dad2e 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/resource-device-inference.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/resource-device-inference.mlir @@ -56,7 +56,7 @@ func @propagate_if_op( "tf.If"(%arg1, %id0, %var_handle) { then_branch = @if_then, else_branch = @if_else, - output_shapes = [], is_stateless = false} + is_stateless = false} : (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) -> () tf_executor.yield @@ -128,8 +128,7 @@ func @propagate_while_op( // CHECK-NEXT: "tf.While" "tf.While"(%arg1, %id0, %var_handle) { body = @while_body, - cond = @while_cond, - output_shapes = [], is_stateless = false} + cond = @while_cond, is_stateless = false} : (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) -> (tensor, tensor<*x!tf.resource>>, @@ -209,8 +208,7 @@ func @error_on_conflict_multiple_callers( : () -> tensor<*x!tf.resource>> "tf.If"(%arg1, %id0, %var_handle) { then_branch = @if_then_and_else, - else_branch = @if_then_and_else, - output_shapes = [], is_stateless = false} + else_branch = @if_then_and_else, is_stateless = false} : (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) -> () "tf.If"(%arg1, %var_handle, %id0) { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir b/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir index 7c8e4382e2b..ac5c2df8f7e 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir @@ -1,4 +1,4 @@ -// RUN: tf-opt %s -split-input-file -verify-diagnostics -tf-resource-op-lifting | FileCheck %s +// RUN: tf-opt %s -split-input-file -verify-diagnostics -tf-resource-op-lifting | FILECHECK_OPTS="" FileCheck %s // Tests that resource load operations are hoisted. 
@@ -147,8 +147,7 @@ func @cluster_with_loop() -> () { "tf_device.cluster"() ( { // CHECK: %[[WHILE:.*]]:2 = "tf.While"(%[[COUNT]], %[[READ]]) %2:3 = "tf.While"(%0, %1, %unused) - {body = @while_body, cond = @while_cond, device = "", is_stateless = false, - output_shapes = [#tf.shape<>, #tf.shape<>]} + {body = @while_body, cond = @while_cond, device = "", is_stateless = false} : (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) -> (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) // CHECK: tf_device.return %[[WHILE]]#1 : tensor @@ -197,8 +196,7 @@ func @cluster_with_loop() -> () { "tf_device.cluster"() ( { // CHECK: %[[WHILE:.*]] = "tf.While"(%[[READ]]) %1 = "tf.While"(%0) { - body = @while_body, cond = @while_cond, device = "", is_stateless = false, - output_shapes = [#tf.shape<>]} + body = @while_body, cond = @while_cond, device = "", is_stateless = false} : (tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>) // CHECK: tf_device.return %[[WHILE]] : tensor @@ -239,8 +237,7 @@ func @cluster_with_loop() -> () { "tf_device.cluster"() ( { // CHECK: %[[WHILE:.*]] = "tf.While"(%[[READ]]) %1 = "tf.While"(%0) { - body = @while_body, cond = @while_cond, device = "", is_stateless = false, - output_shapes = [#tf.shape<>]} + body = @while_body, cond = @while_cond, device = "", is_stateless = false} : (tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>) // CHECK: tf_device.return @@ -278,8 +275,7 @@ func @cluster_with_nested_loop() -> () { "tf_device.cluster"() ( { // CHECK: %[[WHILE:.*]] = "tf.While"(%[[READ]]) %2:2 = "tf.While"(%0, %1) { - body = @while_body, cond = @while_cond, device = "", is_stateless = false, - output_shapes = [#tf.shape<>, #tf.shape<>]} + body = @while_body, cond = @while_cond, device = "", is_stateless = false} : (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) // CHECK: tf_device.return %[[WHILE]] : tensor @@ -295,8 +291,7 @@ func @while_body(%arg0: tensor<*x!tf.resource>>, %arg1: tensor<*x!tf -> (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) { // CHECK: %[[WHILE:.*]] = "tf.While"(%[[BARG0]]) %0:2 = "tf.While"(%arg0, %arg1) { - body = @while_body1, cond = @while_cond1, device = "", is_stateless = false, - output_shapes = [#tf.shape<>, #tf.shape<>]} + body = @while_body1, cond = @while_cond1, device = "", is_stateless = false} : (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) // CHECK-NEXT: return %[[WHILE]] @@ -334,8 +329,7 @@ func @cluster_with_loop() -> () { %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> "tf_device.cluster"() ( { %1 = "tf.While"(%0) { - body = @while_body, cond = @while_cond, device = "", is_stateless = false, - output_shapes = [#tf.shape<>]} + body = @while_body, cond = @while_cond, device = "", is_stateless = false} : (tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>) tf_device.return }) {cluster_attr = "cluster_attr"} : () -> () @@ -359,8 +353,7 @@ func @cluster_with_loop() -> () { %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> "tf_device.cluster"() ( { %1 = "tf.While"(%0) { - body = @while_body, cond = @while_cond, device = "", is_stateless = false, - output_shapes = [#tf.shape<>]} + body = @while_body, cond = @while_cond, device = "", is_stateless = false} : (tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>) tf_device.return }) {cluster_attr = "cluster_attr"} : () -> () @@ -384,8 +377,7 @@ func @cluster_with_loop() -> 
() { %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> "tf_device.cluster"() ( { %1 = "tf.While"(%0) { - body = @while_body, cond = @while_cond, device = "", is_stateless = false, - output_shapes = [#tf.shape<>]} + body = @while_body, cond = @while_cond, device = "", is_stateless = false} : (tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>) tf_device.return }) {cluster_attr = "cluster_attr"} : () -> () @@ -600,6 +592,35 @@ func @if_else(%arg0: tensor<*x!tf.resource>>, %arg1: tensor<*x!tf. // ----- +// Tests that the pass reports error if output does not alias input. + +func @cluster_with_if(%arg0: tensor) -> tensor<4xf32> { + %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> + %1 = "tf.VarHandleOp"() {container = "c", shared_name = "v2"} : () -> tensor<*x!tf.resource>> + %2 = "tf_device.cluster"() ( { + // expected-error @+1 {{unsupported output: resource does not alias input}} + %3 = "tf.If"(%arg0, %0, %1) {then_branch = @if_then, else_branch = @if_else, + is_stateless = false} + : (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) + -> (tensor<*x!tf.resource>>) + %4 = "tf.ReadVariableOp"(%3) : (tensor<*x!tf.resource>>) -> tensor<4xf32> + tf_device.return %4 : tensor<4xf32> + }) {cluster_attr = "cluster_attr"} : () -> tensor<4xf32> + return %2 : tensor<4xf32> +} +func @if_then(%arg0: tensor<*x!tf.resource>>, %arg1: tensor<*x!tf.resource>>) + -> (tensor<*x!tf.resource>>) { + %0 = "tf.foo"(%arg0) : (tensor<*x!tf.resource>>) -> tensor<*x!tf.resource>> + return %0 : tensor<*x!tf.resource>> +} +func @if_else(%arg0: tensor<*x!tf.resource>>, %arg1: tensor<*x!tf.resource>>) + -> (tensor<*x!tf.resource>>) { + %0 = "tf.bar"(%arg0) : (tensor<*x!tf.resource>>) -> tensor<*x!tf.resource>> + return %0 : tensor<*x!tf.resource>> +} + +// ----- + // Tests that the pass lifts resources on two partitioned call ops sharing the // same callee. The lifting should clone the callee then modify the clone. 
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir index 4193edf8cc6..4a5e3c8deaa 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir @@ -100,10 +100,11 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { return %1 : tensor } - // CHECK-LABEL: func @shape_from_if_to_branch_functions - func @shape_from_if_to_branch_functions(%arg0: tensor, %arg1: tensor<1x2x3xf32>) -> tensor<1x2x3xf32> { - %0 = "tf.If"(%arg0, %arg1) {Tcond = i1, Tin = ["tfdtype$DT_FLOAT"], Tout = ["tfdtype$DT_FLOAT"], _xla_propagate_compile_time_consts = true, device = "", else_branch = @if_else_branch, is_stateless = true, name = "if", then_branch = @if_then_branch} : (tensor, tensor<1x2x3xf32>) -> tensor<1x2x3xf32> - return %0 : tensor<1x2x3xf32> + // CHECK-LABEL: func @shape_from_if_to_branch_functions_to_results + // CHECK-SAME: (%arg0: tensor, %arg1: tensor<1x2x3xf32>) -> tensor<1x2x3xf32> + func @shape_from_if_to_branch_functions_to_results(%arg0: tensor, %arg1: tensor<1x2x3xf32>) -> tensor<*xf32> { + %0 = "tf.If"(%arg0, %arg1) {Tcond = i1, Tin = ["tfdtype$DT_FLOAT"], Tout = ["tfdtype$DT_FLOAT"], else_branch = @if_else_branch, is_stateless = true, name = "if", then_branch = @if_then_branch} : (tensor, tensor<1x2x3xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> } // CHECK-LABEL: func @if_then_branch @@ -124,6 +125,27 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { return %0 : tensor<*xf32> } + // Verify shape propagation from function arg -> if region body -> if region output -> function return type + // CHECK-LABEL: shape_from_if_to_region_bodies_to_output + // CHECK-SAME: -> tensor<1x2x3xf32> + func @shape_from_if_to_region_bodies_to_output(%arg0: tensor, %arg1: tensor<1x2x3xf32>) -> tensor<*xf32> { + %unshaped = "tf.Cast"(%arg1) : (tensor<1x2x3xf32>) -> tensor<*xf32> + %0 = "tf.IfRegion"(%arg0) ({ + // CHECK: "tf.Add"{{.+}}(tensor<1x2x3xf32>, tensor<1x2x3xf32>) -> tensor<1x2x3xf32> + // CHECK: "tf.Yield"{{.+}}(tensor<1x2x3xf32>) -> () + %1 = "tf.Add"(%unshaped, %unshaped) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + "tf.Yield"(%1) : (tensor<*xf32>) -> () + }, { + // CHECK: "tf.Sub"{{.+}}(tensor<1x2x3xf32>, tensor<1x2x3xf32>) -> tensor<1x2x3xf32> + // CHECK: "tf.Yield"{{.+}}(tensor<1x2x3xf32>) -> () + %2 = "tf.Sub"(%unshaped, %unshaped) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + "tf.Yield"(%2) : (tensor<*xf32>) -> () + // CHECK: {is_stateless = true} : (tensor) -> tensor<1x2x3xf32> + }) {is_stateless = true} : (tensor) -> tensor<*xf32> + // CHECK: return {{.*}} : tensor<1x2x3xf32> + return %0 : tensor<*xf32> + } + // CHECK-LABEL: func @shape_from_while_to_cond_body_functions func @shape_from_while_to_cond_body_functions(%arg0: tensor<4xf32>, %arg1: tensor>>, %arg2: tensor>>) -> tensor<4xf32> { // CHECK: "tf.While" @@ -169,6 +191,33 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { return %1, %arg1, %arg2 : tensor<*xf32>, tensor<*x!tf.resource>, tensor>> } + // Verify shape propagation from function arg -> while region cond/body -> while region output -> function return type + // CHECK-LABEL: func @shape_from_while_operands_to_cond_body_to_while_results + // CHECK-SAME: -> tensor<1x2x3xf32> + func @shape_from_while_operands_to_cond_body_to_while_results(%arg0: tensor, %arg1: tensor<1x2x3xf32>) -> tensor<*xf32> { + %unshaped = "tf.Cast"(%arg1) : 
(tensor<1x2x3xf32>) -> tensor<*xf32> + // CHECK: "tf.WhileRegion" + %0:2 = "tf.WhileRegion"(%arg0, %unshaped) ({ + // CHECK: {{.*}}({{.+}}: tensor, {{.+}}: tensor<1x2x3xf32>): + ^bb0(%carg0: tensor, %carg1: tensor<*xf32>): + %limit = constant dense<5> : tensor + %cond = "tf.NotEqual"(%carg0, %limit) : (tensor, tensor) -> tensor + "tf.Yield"(%cond) : (tensor) -> () + }, { + // CHECK: {{.*}}({{.+}}: tensor, {{.+}}: tensor<1x2x3xf32>): + ^bb0(%barg0: tensor, %barg1: tensor<*xf32>): + %one = constant dense<1> : tensor + %sub = "tf.Sub"(%barg0, %one) : (tensor, tensor) -> tensor + // CHECK: "tf.Neg"({{.+}}) : (tensor<1x2x3xf32>) -> tensor<1x2x3xf32> + %neg = "tf.Neg"(%barg1) : (tensor<*xf32>) -> tensor<*xf32> + // CHECK: "tf.Yield"{{.+}}, {{.+}}) : (tensor, tensor<1x2x3xf32>) -> () + "tf.Yield"(%sub, %neg) : (tensor, tensor<*xf32>) -> () + // CHECK: {is_stateless = true} : (tensor, tensor<1x2x3xf32>) -> (tensor, tensor<1x2x3xf32>) + }) {is_stateless = true} : (tensor, tensor<*xf32>) -> (tensor, tensor<*xf32>) + // CHECK: return {{.+}}#1 : tensor<1x2x3xf32> + return %0#1 : tensor<*xf32> + } + // CHECK-LABEL: func @shape_from_case_to_branch_functions( // CHECK-SAME: %[[ARG_0:.*]]: tensor, // CHECK-SAME: %[[ARG_1:.*]]: tensor>> @@ -219,7 +268,7 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { // CHECK-LABEL: func @reused_if_then_branch // CHECK-SAME: (%arg0: tensor<*xf32>) -> tensor<*xf32> - // expected-warning @+1 {{expected control flow function reused_if_then_branch to have exactly 1 use}} + // expected-warning @+1 {{expected control flow function @reused_if_then_branch to have exactly 1 use}} func @reused_if_then_branch(%arg0: tensor<*xf32>) -> tensor<*xf32> { // CHECK: return // CHECK-SAME: tensor<*xf32> @@ -228,7 +277,7 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { // CHECK-LABEL: func @reused_if_else_branch // CHECK-SAME: (%arg0: tensor<*xf32>) -> tensor<*xf32> - // expected-warning @+1 {{expected control flow function reused_if_else_branch to have exactly 1 use}} + // expected-warning @+1 {{expected control flow function @reused_if_else_branch to have exactly 1 use}} func @reused_if_else_branch(%arg0: tensor<*xf32>) -> tensor<*xf32> { // CHECK: "tf.Identity"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> %0 = "tf.Identity"(%arg0) : (tensor<*xf32>) -> (tensor<*xf32>) @@ -499,4 +548,16 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { %outputs_2 = "tf.TensorSliceDataset"(%outputs_0) {device = "", output_shapes = [#tf.shape<>]} : (tensor<*xf32>) -> tensor return } + + // Test resource result subtypes are propagated to call op results. 
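+  // The call result below is expected to be refined so that it carries the +  // same resource subtype as the callee's return type.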
+ // CHECK-LABEL: func @pcall_resource_result + func @pcall_resource_result(%arg0: tensor<*x!tf.resource>>) { + // CHECK: "tf.StatefulPartitionedCall" + // CHECK-SAME: (tensor<*x!tf.resource>>) -> tensor<*x!tf.resource>> + %0 = "tf.StatefulPartitionedCall"(%arg0) {config = "", config_proto = "", executor_type = "", f = @pcall_resource_result_func} : (tensor<*x!tf.resource>>) -> tensor<*x!tf.resource> + return + } + func @pcall_resource_result_func(%arg0: tensor<*x!tf.resource>>) -> tensor<*x!tf.resource>> { + return %arg0 : tensor<*x!tf.resource>> + } } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir index 44646690519..20a0e22c48e 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir @@ -191,6 +191,24 @@ func @testMul(%arg0: tensor<2xui16>) -> (tensor<2xui16>) { // ----- +// Test error message for incompatible element types. +func @testIncompatibleElementTypes(%arg0: tensor<3x2xf32>, %arg1: tensor<3x2xf64>) -> (tensor<3x2xf32>) { + // expected-error @+1 {{'tf.Mul' op requires compatible element types for all operands and results}} + %0 = "tf.Mul"(%arg0, %arg1) : (tensor<3x2xf32>, tensor<3x2xf64>) -> tensor<3x2xf32> + return %0 : tensor<3x2xf32> +} + +// ----- + +// Test error message for incompatible element types. +func @testIncompatibleElementTypes(%arg0: tensor<3x2xf32>, %arg1: tensor<3x2xf32>) -> (tensor<3x2xf64>) { + // expected-error @+1 {{'tf.Mul' op requires compatible element types for all operands and results}} + %0 = "tf.Mul"(%arg0, %arg1) : (tensor<3x2xf32>, tensor<3x2xf32>) -> tensor<3x2xf64> + return %0 : tensor<3x2xf64> +} + +// ----- + // CHECK-LABEL: func @testReshape(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>, %arg2: tensor<10000xf32>, %arg3: tensor<*xi32>) func @testReshape(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>, %arg2: tensor<10000xf32>, %arg3: tensor<*xi32>) -> (tensor<100x100xf32>, tensor<*xf32>, tensor<10000xf32>, tensor<100x100xf32>, tensor<*xf32>, tensor<*xf32>) { %shape1 = constant dense<100> : tensor<2xi32> @@ -2026,6 +2044,71 @@ func @testTranspose(tensor<2x3xf32>) -> tensor<3x2xf32> { // ----- +// Test tf.Transpose with partial unknown shape +// CHECK-LABEL: testTranspose +func @testTranspose(tensor<2x?xf32>) -> tensor { +^bb0(%arg0: tensor<2x?xf32>): + %cst = constant dense<[1, 0]> : tensor<2xi32> + %0 = "tf.Transpose"(%arg0, %cst) {T = "tfdtype$DT_FLOAT", Tperm = "tfdtype$DT_INT32"} : (tensor<2x?xf32>, tensor<2xi32>) -> tensor + return %0 : tensor +} + +// ----- + +// Test tf.Transpose with different partial unknown shape +// CHECK-LABEL: testTranspose +func @testTranspose(tensor<2x?x?xf32>) -> tensor<3x?x2xf32> { +^bb0(%arg0: tensor<2x?x?xf32>): + %cst = constant dense<[2, 1, 0]> : tensor<3xi32> + %0 = "tf.Transpose"(%arg0, %cst) {T = "tfdtype$DT_FLOAT", Tperm = "tfdtype$DT_INT32"} : (tensor<2x?x?xf32>, tensor<3xi32>) -> tensor<3x?x2xf32> + return %0 : tensor<3x?x2xf32> +} + +// ----- + +// Test tf.Transpose with invalid rank of perm +func @testTranspose(tensor<2x3xf32>, tensor<1x2xi32>) -> tensor<3x2xf32> { +^bb0(%arg0: tensor<2x3xf32>, %arg1: tensor<1x2xi32>): + // expected-error @+1 {{expected perm to be a 1-D Tensor, got perm of rank 2}} + %0 = "tf.Transpose"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", Tperm = "tfdtype$DT_INT32"} : (tensor<2x3xf32>, tensor<1x2xi32>) -> tensor<3x2xf32> + return %0 : tensor<3x2xf32> +} + +// ----- + +// Test tf.Transpose with invalid size of perm +func 
@testTranspose(tensor<2x3xf32>) -> tensor<3x2xf32> { +^bb0(%arg0: tensor<2x3xf32>): + %cst = constant dense<[1, 0, 2]> : tensor<3xi32> + // expected-error @+1 {{expected perm to be a 1-D Tensor of size equal to the rank of x, got perm of size 3, and x of rank 2}} + %0 = "tf.Transpose"(%arg0, %cst) {T = "tfdtype$DT_FLOAT", Tperm = "tfdtype$DT_INT32"} : (tensor<2x3xf32>, tensor<3xi32>) -> tensor<3x2xf32> + return %0 : tensor<3x2xf32> +} + +// ----- + +// Test tf.Transpose with invalid rank of y +func @testTranspose(tensor<2x3xf32>) -> tensor<3x2x1xf32> { +^bb0(%arg0: tensor<2x3xf32>): + %cst = constant dense<[1, 0]> : tensor<2xi32> + // expected-error @+1 {{x should be of the same rank with y, got x of rank 2, and y of rank 3}} + %0 = "tf.Transpose"(%arg0, %cst) {T = "tfdtype$DT_FLOAT", Tperm = "tfdtype$DT_INT32"} : (tensor<2x3xf32>, tensor<2xi32>) -> tensor<3x2x1xf32> + return %0 : tensor<3x2x1xf32> +} + +// ----- + +// Test tf.Transpose with invalid shape of y +func @testTranspose(tensor<2x3x4xf32>) -> tensor<3x2x4xf32> { +^bb0(%arg0: tensor<2x3x4xf32>): + %cst = constant dense<[2, 0, 1]> : tensor<3xi32> + // expected-error @+1 {{requires y.shape[0] (3) to be equal to x.shape[perm[2]] (4)}} + %0 = "tf.Transpose"(%arg0, %cst) {T = "tfdtype$DT_FLOAT", Tperm = "tfdtype$DT_INT32"} : (tensor<2x3x4xf32>, tensor<3xi32>) -> tensor<3x2x4xf32> + return %0 : tensor<3x2x4xf32> +} + +// ----- + // Test invalid tf.Less func @testLess(tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> { ^bb0(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>): diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops_invalid.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops_invalid.mlir index 2f034f1bfae..0e9814de137 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops_invalid.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops_invalid.mlir @@ -232,6 +232,20 @@ func @invalid_island(%arg0: tensor<*xf32>, %ctl: !tf_executor.control) { // ----- +// Check that an island body doesn't have any block arguments. +func @invalid_island(%arg0: tensor<*xf32>, %ctl: !tf_executor.control) { + tf_executor.graph { + "tf_executor.island"() ({ + // expected-error@-1 {{expects body without any arguments}} + ^entry(%arg: tensor<2xi32>): + tf_executor.yield + }) : () -> (!tf_executor.control) + } + return +} + +// ----- + // Check that an island body can't be empty. func @invalid_island(%arg0: tensor<*xf32>, %ctl: !tf_executor.control) { tf_executor.graph { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/control_flow_upgrade_legacy_v1.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/control_flow_upgrade_legacy_v1.py index 209ed3492e8..19e7a90c1e1 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/control_flow_upgrade_legacy_v1.py +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/control_flow_upgrade_legacy_v1.py @@ -33,9 +33,10 @@ from tensorflow.python.ops import control_flow_ops def Test(): data = tf.constant([1, 2, 3, 4, 5, 6]) - zero = tf.convert_to_tensor(0) - one = tf.convert_to_tensor(1) - less_op = tf.less(zero, one) + # Create placeholders to prevent constant folding. 
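+  # (With constant inputs the tf.less predicate could be folded away, removing +  # the Switch/Merge control flow this test is meant to exercise.)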
+ x_op = tf.placeholder(dtype=tf.int32) + y_op = tf.placeholder(dtype=tf.int32) + less_op = tf.less(x_op, y_op) switch_op = control_flow_ops.switch(data, less_op) merge_op = control_flow_ops.merge(switch_op)[0] result = tf.transpose(merge_op) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/hash_table_asset_v1.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/hash_table_asset_v1.py index 7e86953eb8f..4cb931253b3 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/hash_table_asset_v1.py +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/hash_table_asset_v1.py @@ -27,13 +27,15 @@ import tensorflow.compat.v1 as tf from tensorflow.compiler.mlir.tensorflow.tests.tf_saved_model import common_v1 # CHECK: "tf_saved_model.session_initializer"() {initializer = [[init:@.*]]} : () -> () -# CHECK: "tf_saved_model.asset"() {filename = {{.*}}, sym_name = "[[asset:.*]]"} +# CHECK: "tf_saved_model.asset"() {filename = {{.*}}, sym_name = "[[asset1:__tf_saved_model_asset1_.*]]"} +# CHECK: "tf_saved_model.asset"() {filename = {{.*}}, sym_name = "[[asset0:__tf_saved_model_asset0_.*]]"} # CHECK: func [[init]] -# CHECK-SAME: [[ARG:%.*]]: tensor {tf_saved_model.bound_input = @[[asset]]} +# CHECK-SAME: [[ARG0:%.*]]: tensor {tf_saved_model.bound_input = @[[asset0]]} +# CHECK-SAME: [[ARG1:%.*]]: tensor {tf_saved_model.bound_input = @[[asset1]]} # CHECK-NEXT: [[R0:%.*]] = "tf.HashTableV2"() # CHECK-SAME: shared_name = "[[hash_table:.*]]" -# CHECK-NEXT: "tf.InitializeTableFromTextFileV2"([[R0]], [[ARG]]) +# CHECK-NEXT: "tf.InitializeTableFromTextFileV2"([[R0]], [[ARG0]]) def write_vocabulary_file(vocabulary): @@ -48,11 +50,16 @@ def write_vocabulary_file(vocabulary): def test(): + vocabulary_file = write_vocabulary_file(['cat', 'is', 'on', 'the', 'mat']) table_initializer = tf.lookup.TextFileInitializer( - write_vocabulary_file(['cat', 'is', 'on', 'the', 'mat']), tf.string, - tf.lookup.TextFileIndex.WHOLE_LINE, tf.int64, + vocabulary_file, tf.string, tf.lookup.TextFileIndex.WHOLE_LINE, tf.int64, tf.lookup.TextFileIndex.LINE_NUMBER) + # Incur another bound_input on the asset, but with a different sym_name, i.e., + # __tf_saved_model_asset1_tokens.txt vs. __tf_saved_model_asset0_tokens.txt. table = tf.lookup.StaticVocabularyTable(table_initializer, num_oov_buckets=10) + vocab_file_tensor = tf.convert_to_tensor(vocabulary_file, tf.string, + name='asset_filepath') + tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, vocab_file_tensor) x = tf.placeholder(tf.string, shape=(), name='input') r = table.lookup(x) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_deduplicate_bound_input_bindings.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_deduplicate_bound_input_bindings.mlir new file mode 100644 index 00000000000..22fd3d86068 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_deduplicate_bound_input_bindings.mlir @@ -0,0 +1,33 @@ +// RUN: tf-opt %s -split-input-file -verify-diagnostics -tf-saved-model-dedup-bound-input-binding-pass | FileCheck %s + +module attributes {tf_saved_model.semantics, tf_saved_model.under_construction} { + // Test case: Remove duplicate bound_input symbols. 
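+  // Arguments bound to the same global tensor should collapse onto the first +  // such argument, and uses of the removed arguments are remapped to it.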
+ "tf_saved_model.global_tensor"() { is_mutable, sym_name = "v", type = tensor, value = dense<42.0> : tensor } : () -> () + "tf_saved_model.global_tensor"() { is_mutable, sym_name = "w", type = tensor, value = dense<43.0> : tensor } : () -> () + "tf_saved_model.global_tensor"() { is_mutable, sym_name = "x", type = tensor, value = dense<44.0> : tensor } : () -> () + // CHECK: func @f + // CHECK: %arg0: tensor>> {tf_saved_model.bound_input = @v} + // CHECK: %arg1: tensor>> {tf_saved_model.bound_input = @w} + // CHECK: %arg2: tensor>> {tf_saved_model.bound_input = @x} + // CHECK-NOT: %arg3 + // CHECK-NOT: %arg4 + func @f( + %arg0: tensor>> {tf_saved_model.bound_input = @v}, + %arg1: tensor>> {tf_saved_model.bound_input = @w}, + %arg2: tensor>> {tf_saved_model.bound_input = @v}, + %arg3: tensor>> {tf_saved_model.bound_input = @x}, + %arg4: tensor>> {tf_saved_model.bound_input = @v} + ) attributes {tf_saved_model.exported_names = ["f"]} { + // CHECK: "tf.ReadVariableOp"(%arg0) : (tensor>>) -> tensor + // CHECK: "tf.ReadVariableOp"(%arg1) : (tensor>>) -> tensor + // CHECK: "tf.ReadVariableOp"(%arg0) : (tensor>>) -> tensor + // CHECK: "tf.ReadVariableOp"(%arg2) : (tensor>>) -> tensor + // CHECK: "tf.ReadVariableOp"(%arg0) : (tensor>>) -> tensor + %val0 = "tf.ReadVariableOp"(%arg0) : (tensor>>) -> tensor + %val1 = "tf.ReadVariableOp"(%arg1) : (tensor>>) -> tensor + %val2 = "tf.ReadVariableOp"(%arg2) : (tensor>>) -> tensor + %val3 = "tf.ReadVariableOp"(%arg3) : (tensor>>) -> tensor + %val4 = "tf.ReadVariableOp"(%arg4) : (tensor>>) -> tensor + return + } +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops.mlir index 7156a1fab63..d2c5509b52d 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops.mlir @@ -76,3 +76,16 @@ module attributes {tf_saved_model.semantics, tf_saved_model.under_construction} } } + +// ----- + +module attributes {tf_saved_model.semantics, tf_saved_model.under_construction} { + "tf_saved_model.global_tensor"() { is_mutable, sym_name = "v", type = tensor, value = dense<42.0> : tensor } : () -> () + // CHECK: func @f + func @f( + %arg0: tensor>> {tf_saved_model.bound_input = @v}, + %arg1: tensor>> {tf_saved_model.bound_input = @v} + ) attributes {tf_saved_model.exported_names = ["f"]} { + return + } +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir index dcb889ff99e..714c8908825 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir @@ -400,3 +400,17 @@ module attributes {tf_saved_model.semantics} { } } + +// ----- + +module attributes {tf_saved_model.semantics} { + + "tf_saved_model.global_tensor"() { is_mutable, sym_name = "v", type = tensor, value = dense<42.0> : tensor } : () -> () + // expected-error@+1 {{duplicate 'tf_saved_model.bound_input' binding}} + func @f( + %arg0: tensor>> {tf_saved_model.bound_input = @v}, + %arg1: tensor>> {tf_saved_model.bound_input = @v} + ) attributes {tf_saved_model.exported_names = ["f"]} { + return + } +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu-variable-runtime-reformatting.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu-variable-runtime-reformatting.mlir index 43be8743e51..1e308b42bfc 100644 --- 
a/tensorflow/compiler/mlir/tensorflow/tests/tpu-variable-runtime-reformatting.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu-variable-runtime-reformatting.mlir @@ -20,8 +20,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr {T = ["tfdtype$DT_INT32", "tfdtype$DT_RESOURCE", "tfdtype$DT_RESOURCE", "tfdtype$DT_RESOURCE", "tfdtype$DT_RESOURCE"], body = @while_body_7560, - cond = @while_cond_7550, device = "", is_stateless = false, - output_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>]} + cond = @while_cond_7550, device = "", is_stateless = false} : (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) -> (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, @@ -217,8 +216,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr {T = ["tfdtype$DT_INT32", "tfdtype$DT_RESOURCE", "tfdtype$DT_RESOURCE", "tfdtype$DT_RESOURCE", "tfdtype$DT_RESOURCE"], body = @while_body_7560, - cond = @while_cond_7550, device = "", is_stateless = false, - output_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>]} + cond = @while_cond_7550, device = "", is_stateless = false} : (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) -> (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, @@ -305,8 +303,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr {T = ["tfdtype$DT_INT32", "tfdtype$DT_RESOURCE", "tfdtype$DT_RESOURCE", "tfdtype$DT_RESOURCE"], body = @while_body_7560, - cond = @while_cond_7550, device = "", is_stateless = false, - output_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>]} + cond = @while_cond_7550, device = "", is_stateless = false} : (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) -> (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir index 208146a1226..1f516a25824 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir @@ -1,4 +1,4 @@ -// RUN: tf-opt %s -split-input-file -verify-diagnostics -tf-tpu-extract-outside-compilation | FileCheck %s +// RUN: tf-opt %s -split-input-file -verify-diagnostics -tf-tpu-extract-outside-compilation | FILECHECK_OPTS="" FileCheck %s // Tests that missing `_xla_outside_compilation` attribute value results in an error. 
@@ -143,14 +143,14 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" // CHECK-NEXT: "tf_device.launch" - // CHECK: %[[STATUS_OUTPUT:[a-z_0-9]*]], %[[PROGRAM_OUTPUT:[a-z_0-9]*]] = "tf._TPUCompileMlir" + // CHECK: %[[PROGRAM_OUTPUT:[a-z_0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey" // CHECK: %[[RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT]]) - // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK-SAME: key = "host_compute_channel_cluster1_args" // CHECK: "tf.B"(%[[RECV_OUTPUT]]) // CHECK: "tf_device.cluster" // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" - // CHECK: "tf._HostComputeMlir"(%[[A_OUTPUT]]) - // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK: "tf._XlaHostComputeMlir"(%[[A_OUTPUT]]) + // CHECK-SAME: send_key = "host_compute_channel_cluster1_args" %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { %2 = "tf_device.cluster"() ( { %3 = "tf.A"() : () -> (tensor) @@ -172,15 +172,17 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" // CHECK-NEXT: "tf_device.launch" - // CHECK: %[[STATUS_OUTPUT:[a-z_0-9]*]], %[[PROGRAM_OUTPUT:[a-z_0-9]*]] = "tf._TPUCompileMlir" + // CHECK: %[[PROGRAM_OUTPUT:[a-z_0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey" // CHECK: "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT]]) + // CHECK-SAME: key = "host_compute_channel_cluster1_args" // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"() // CHECK: "tf._XlaSendFromHost"(%[[B_OUTPUT]], %[[PROGRAM_OUTPUT]]) - // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK-SAME: key = "host_compute_channel_cluster1_retvals" // CHECK: "tf_device.cluster" // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" - // CHECK: %[[HOST_OUTPUT:[0-9]*]] = "tf._HostComputeMlir"() - // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK: %[[HOST_OUTPUT:[0-9]*]] = "tf._XlaHostComputeMlir"() + // CHECK-SAME: recv_key = "host_compute_channel_cluster1_retvals" + // CHECK-SAME: send_key = "host_compute_channel_cluster1_args" // CHECK: "tf.C"(%[[HOST_OUTPUT]]) %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { %2 = "tf_device.cluster"() ( { @@ -203,14 +205,15 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" // CHECK-NEXT: "tf_device.launch" - // CHECK: %[[STATUS_OUTPUT:[a-z_0-9]*]], %[[PROGRAM_OUTPUT:[a-z_0-9]*]] = "tf._TPUCompileMlir" + // CHECK: %[[PROGRAM_OUTPUT:[a-z_0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey" // CHECK: %[[RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT]]) // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"(%[[RECV_OUTPUT]]) // CHECK: "tf._XlaSendFromHost"(%[[B_OUTPUT]], %[[PROGRAM_OUTPUT]]) + // CHECK-SAME: key = "host_compute_channel_cluster1_retvals" // CHECK: "tf_device.cluster" // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" - // CHECK: %[[HOST_OUTPUT:[0-9]*]] = "tf._HostComputeMlir"(%[[A_OUTPUT]]) - // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK: %[[HOST_OUTPUT:[0-9]*]] = "tf._XlaHostComputeMlir"(%[[A_OUTPUT]]) + // CHECK-SAME: recv_key = "host_compute_channel_cluster1_retvals" // CHECK: tf_device.return %[[HOST_OUTPUT]] %1:2 = 
tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { %2 = "tf_device.cluster"() ( { @@ -233,15 +236,15 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" // CHECK-NEXT: "tf_device.launch" - // CHECK: %[[STATUS_OUTPUT:[a-z_0-9]*]], %[[PROGRAM_OUTPUT:[a-z_0-9]*]] = "tf._TPUCompileMlir" + // CHECK: %[[PROGRAM_OUTPUT:[a-z_0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey" // CHECK: %[[RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT]]) // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"(%[[RECV_OUTPUT]]) // CHECK: "tf._XlaSendFromHost"(%[[B_OUTPUT]], %[[PROGRAM_OUTPUT]]) - // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK-SAME: key = "host_compute_channel_cluster1_retvals" // CHECK: "tf_device.cluster" // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" - // CHECK: %[[HOST_OUTPUT:[0-9]*]] = "tf._HostComputeMlir"(%[[A_OUTPUT]]) - // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK: %[[HOST_OUTPUT:[0-9]*]] = "tf._XlaHostComputeMlir"(%[[A_OUTPUT]]) + // CHECK-SAME: recv_key = "host_compute_channel_cluster1_retvals" // CHECK: "tf.C"(%[[HOST_OUTPUT]]) %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { %2 = "tf_device.cluster"() ( { @@ -264,16 +267,16 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" // CHECK-NEXT: "tf_device.launch" - // CHECK: %[[STATUS_OUTPUT:[a-z_0-9]*]], %[[PROGRAM_OUTPUT:[a-z_0-9]*]] = "tf._TPUCompileMlir" + // CHECK: %[[PROGRAM_OUTPUT:[a-z_0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey" // CHECK: %[[RECV_OUTPUT:[0-9]*]]:2 = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT]]) // CHECK: %[[B_OUTPUT:[0-9]*]]:2 = "tf.C"(%[[RECV_OUTPUT]]#0, %[[RECV_OUTPUT]]#1) // CHECK: "tf._XlaSendFromHost"(%[[B_OUTPUT]]#0, %[[B_OUTPUT]]#1, %[[PROGRAM_OUTPUT]]) - // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK-SAME: key = "host_compute_channel_cluster1_retvals" // CHECK: "tf_device.cluster" // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" - // CHECK: %[[HOST_OUTPUT:[0-9]*]]:2 = "tf._HostComputeMlir"(%[[A_OUTPUT]], %[[B_OUTPUT]]) - // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK: %[[HOST_OUTPUT:[0-9]*]]:2 = "tf._XlaHostComputeMlir"(%[[A_OUTPUT]], %[[B_OUTPUT]]) + // CHECK-SAME: recv_key = "host_compute_channel_cluster1_retvals" // CHECK: "tf.D"(%[[HOST_OUTPUT]]#0) // CHECK: "tf.E"(%[[HOST_OUTPUT]]#1) %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { @@ -299,24 +302,24 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" // CHECK-NEXT: "tf_device.launch" - // CHECK: %[[STATUS_OUTPUT2:[a-z_0-9]*]], %[[PROGRAM_OUTPUT2:[a-z_0-9]*]] = "tf._TPUCompileMlir" + // CHECK: %[[PROGRAM_OUTPUT2:[a-z_0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey" // CHECK: %[[RECV_OUTPUT2:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT2]]) // CHECK: %[[D_OUTPUT:[0-9]*]] = "tf.D"(%[[RECV_OUTPUT2]]) // CHECK: "tf._XlaSendFromHost"(%[[D_OUTPUT]], %[[PROGRAM_OUTPUT]]) - // CHECK-SAME: key = "host_compute_channel_cluster2" + // CHECK-SAME: key = "host_compute_channel_cluster2_retvals" // CHECK: "tf_device.launch" 
- // CHECK: %[[STATUS_OUTPUT1:[a-z_0-9]*]], %[[PROGRAM_OUTPUT1:[a-z_0-9]*]] = "tf._TPUCompileMlir" + // CHECK: %[[PROGRAM_OUTPUT1:[a-z_0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey" // CHECK: %[[RECV_OUTPUT1:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT1]]) // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"(%[[RECV_OUTPUT1]]) // CHECK: "tf._XlaSendFromHost"(%[[B_OUTPUT]], %[[PROGRAM_OUTPUT]]) - // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK-SAME: key = "host_compute_channel_cluster1_retvals" // CHECK: "tf_device.cluster" // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" - // CHECK: %[[HOST_OUTPUT1:[0-9]*]] = "tf._HostComputeMlir"(%[[A_OUTPUT]]) - // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK: %[[HOST_OUTPUT1:[0-9]*]] = "tf._XlaHostComputeMlir"(%[[A_OUTPUT]]) + // CHECK-SAME: recv_key = "host_compute_channel_cluster1_retvals" // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C"(%[[HOST_OUTPUT1]]) - // CHECK: %[[HOST_OUTPUT2:[0-9]*]] = "tf._HostComputeMlir"(%[[C_OUTPUT]]) - // CHECK-SAME: key = "host_compute_channel_cluster2" + // CHECK: %[[HOST_OUTPUT2:[0-9]*]] = "tf._XlaHostComputeMlir"(%[[C_OUTPUT]]) + // CHECK-SAME: recv_key = "host_compute_channel_cluster2_retvals" // CHECK: "tf.E"(%[[HOST_OUTPUT2]]) %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { %2 = "tf_device.cluster"() ( { @@ -341,14 +344,14 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" // CHECK-NEXT: "tf_device.launch" - // CHECK: %[[STATUS_OUTPUT:[a-z_0-9]*]], %[[PROGRAM_OUTPUT:[a-z_0-9]*]] = "tf._TPUCompileMlir" + // CHECK: %[[PROGRAM_OUTPUT:[a-z_0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey" // CHECK: %[[RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT]]) - // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK-SAME: key = "host_compute_channel_cluster1_args" // CHECK: "tf.B"(%arg0, %[[RECV_OUTPUT]]) // CHECK: "tf_device.cluster" // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" - // CHECK: "tf._HostComputeMlir"(%[[A_OUTPUT]]) - // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK: "tf._XlaHostComputeMlir"(%[[A_OUTPUT]]) + // CHECK-SAME: send_key = "host_compute_channel_cluster1_args" %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { %2 = "tf_device.cluster"() ( { %3 = "tf.A"() : () -> (tensor) @@ -370,22 +373,22 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" // CHECK-NEXT: "tf_device.launch" - // CHECK: %[[STATUS_OUTPUT_2:[a-z_0-9]*]], %[[PROGRAM_OUTPUT_2:[a-z_0-9]*]] = "tf._TPUCompileMlir" + // CHECK: %[[PROGRAM_OUTPUT_2:[a-z_0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey" // CHECK: %[[RECV_OUTPUT_2:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT_2]]) - // CHECK-SAME: key = "host_compute_channel_cluster2" + // CHECK-SAME: key = "host_compute_channel_cluster2_args" // CHECK: "tf.D"(%[[RECV_OUTPUT_2]]) // CHECK: "tf_device.launch" - // CHECK: %[[STATUS_OUTPUT_1:[a-z_0-9]*]], %[[PROGRAM_OUTPUT_1:[a-z_0-9]*]] = "tf._TPUCompileMlir" + // CHECK: %[[PROGRAM_OUTPUT_1:[a-z_0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey" // CHECK: %[[RECV_OUTPUT_1:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT_1]]) - // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK-SAME: key = 
"host_compute_channel_cluster1_args" // CHECK: "tf.B"(%[[RECV_OUTPUT_1]]) // CHECK: "tf_device.cluster" // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" - // CHECK: "tf._HostComputeMlir"(%[[A_OUTPUT]]) - // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK: "tf._XlaHostComputeMlir"(%[[A_OUTPUT]]) + // CHECK-SAME: send_key = "host_compute_channel_cluster1_args" // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C" - // CHECK: "tf._HostComputeMlir"(%[[C_OUTPUT]]) - // CHECK-SAME: key = "host_compute_channel_cluster2" + // CHECK: "tf._XlaHostComputeMlir"(%[[C_OUTPUT]]) + // CHECK-SAME: send_key = "host_compute_channel_cluster2_args" %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { %2 = "tf_device.cluster"() ( { %3 = "tf.A"() : () -> (tensor) @@ -408,16 +411,16 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" // CHECK-NEXT: "tf_device.launch" - // CHECK: %[[STATUS_OUTPUT:[a-z_0-9]*]], %[[PROGRAM_OUTPUT:[a-z_0-9]*]] = "tf._TPUCompileMlir" + // CHECK: %[[PROGRAM_OUTPUT:[a-z_0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey" // CHECK: %[[RECV_OUTPUT:[0-9]*]]:2 = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT]]) - // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK-SAME: key = "host_compute_channel_cluster1_args" // CHECK: "tf.C"(%[[RECV_OUTPUT]]#0) // CHECK: "tf.D"(%[[RECV_OUTPUT]]#1, %[[RECV_OUTPUT]]#0) // CHECK: "tf_device.cluster" // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" - // CHECK: "tf._HostComputeMlir"(%[[A_OUTPUT]], %[[B_OUTPUT]]) - // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK: "tf._XlaHostComputeMlir"(%[[A_OUTPUT]], %[[B_OUTPUT]]) + // CHECK-SAME: send_key = "host_compute_channel_cluster1_args" %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { %2 = "tf_device.cluster"() ( { %3 = "tf.A"() : () -> (tensor) @@ -453,4 +456,236 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor } return %1 : tensor } + + // Tests extraction of a single outside compiled cluster inside a tf.IfRegion op. 
+ + // CHECK-LABEL: func @outside_compiled_ops_inside_tf_if + func @outside_compiled_ops_inside_tf_if(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK-NEXT: %[[PLACEHOLDER_KEY:[0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey"() + // CHECK-NEXT: %[[PREDICATE_RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "if_predicate_channel_cluster1_0" + // CHECK-NEXT: tf.IfRegion"(%[[PREDICATE_RECV_OUTPUT]]) + // CHECK-NEXT: %[[ARG_RECV_OUTPUT:[0-9]*]]:2 = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "host_compute_channel_cluster1_args" + // CHECK: "tf.D"(%[[ARG_RECV_OUTPUT]]#0, %[[ARG_RECV_OUTPUT]]#1) + // CHECK: "tf._XlaSendFromHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "host_compute_channel_cluster1_retvals" + // CHECK-NEXT: "tf.Yield"() : () -> () + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" + // CHECK: %[[G_OUTPUT:[0-9]*]] = "tf.G" + // CHECK: "tf.XlaSendToHost"(%6) {key = "if_predicate_channel_cluster1_0"} + // CHECK-NEXT: tf.IfRegion"(%[[G_OUTPUT]]) + // CHECK: "tf._XlaHostComputeMlir"(%[[B_OUTPUT]], %[[A_OUTPUT]]) + // CHECK-SAME: recv_key = "host_compute_channel_cluster1_retvals" + // CHECK-SAME: send_key = "host_compute_channel_cluster1_args" + // CHECK-SAME: tpu_core = 0 + // CHECK-NEXT: "tf.Yield"() : () -> () + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + %4 = "tf.B"() : () -> (tensor) + %6 = "tf.G"() : () -> (tensor) + + "tf.IfRegion"(%6) ({ + "tf.D"(%4, %3) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> () + "tf.Yield"() : () -> () + }, { + "tf.Yield"() : () -> () + }) { is_stateless = false} : (tensor) -> () + + %5 = "tf.E"() : () -> tensor + tf_device.return %5 : tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor + } + + // Tests extraction of a single outside compiled cluster inside a tf.IfRegion + // op with return values. 
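+  // The value computed by the outside compiled op on the host is sent back +  // over the retvals channel and yielded by the device-side IfRegion.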
+ + // CHECK-LABEL: func @outside_compiled_ops_inside_tf_if_with_return_values + func @outside_compiled_ops_inside_tf_if_with_return_values( + %arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK-NEXT: %[[PLACEHOLDER_KEY:[0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey"() + // CHECK-NEXT: %[[PREDICATE_RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "if_predicate_channel_cluster1_0" + // CHECK-NEXT: tf.IfRegion"(%[[PREDICATE_RECV_OUTPUT]]) + // CHECK-NEXT: %[[ARG_RECV_OUTPUT:[0-9]*]]:2 = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "host_compute_channel_cluster1_args" + // CHECK: %[[D_OUTPUT:[0-9]*]] = "tf.D"(%[[ARG_RECV_OUTPUT]]#0, %[[ARG_RECV_OUTPUT]]#1) + // CHECK: "tf._XlaSendFromHost"(%[[D_OUTPUT]], %[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "host_compute_channel_cluster1_retvals" + // CHECK-NEXT: "tf.Yield"() : () -> () + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" + // CHECK: %[[G_OUTPUT:[0-9]*]] = "tf.G" + // CHECK: "tf.XlaSendToHost"(%6) {key = "if_predicate_channel_cluster1_0"} + // CHECK-NEXT: tf.IfRegion"(%[[G_OUTPUT]]) + // CHECK: %[[HOST_COMPUTE_OUT:[0-9]*]] = "tf._XlaHostComputeMlir"(%[[B_OUTPUT]], %[[A_OUTPUT]]) + // CHECK-SAME: recv_key = "host_compute_channel_cluster1_retvals" + // CHECK-SAME: send_key = "host_compute_channel_cluster1_args" + // CHECK-SAME: tpu_core = 0 + // CHECK-NEXT: "tf.Yield"(%[[HOST_COMPUTE_OUT]]) + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + %4 = "tf.B"() : () -> (tensor) + %6 = "tf.G"() : () -> (tensor) + + "tf.IfRegion"(%6) ({ + %7 = "tf.D"(%4, %3) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> (tensor) + "tf.Yield"(%7) : (tensor) -> () + }, { + + %8 = "tf.F"() : () -> (tensor) + "tf.Yield"(%8) : (tensor) -> () + }) { is_stateless = false} : (tensor) -> (tensor) + + %5 = "tf.E"() : () -> tensor + tf_device.return %5 : tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor + } + + // Tests extraction of a single outside compiled cluster inside a tf.IfRegion op without external inputs/outputs + + // CHECK-LABEL: func @outside_compiled_ops_inside_tf_if_without_input_outputs + func @outside_compiled_ops_inside_tf_if_without_input_outputs( + %arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK-NEXT: %[[PLACEHOLDER_KEY:[0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey"() + // CHECK-NEXT: %[[PREDICATE_RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "if_predicate_channel_cluster1_0" + // CHECK-NEXT: tf.IfRegion"(%[[PREDICATE_RECV_OUTPUT]]) + // CHECK: "tf.D" + // CHECK-NEXT: "tf.Yield"() : () -> () + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" + // CHECK: 
%[[G_OUTPUT:[0-9]*]] = "tf.G" + // CHECK: "tf.XlaSendToHost"(%6) {key = "if_predicate_channel_cluster1_0"} + // CHECK-NEXT: tf.IfRegion"(%[[G_OUTPUT]]) + // CHECK-NEXT: "tf.Yield"() : () -> () + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + %4 = "tf.B"() : () -> (tensor) + %6 = "tf.G"() : () -> (tensor) + + "tf.IfRegion"(%6) ({ + "tf.D"() {_xla_outside_compilation = "cluster1"} : () -> () + "tf.Yield"() : () -> () + }, { + "tf.Yield"() : () -> () + }) { is_stateless = false} : (tensor) -> () + + %5 = "tf.E"() : () -> tensor + tf_device.return %5 : tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor + } + + // Tests extraction of a single outside compiled cluster inside a nested + // tf.IfRegion op. + + // CHECK-LABEL: func @outside_compiled_ops_inside_nested_if + func @outside_compiled_ops_inside_nested_if(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK-NEXT: %[[PLACEHOLDER_KEY:[0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey"() + // CHECK-NEXT: %[[PREDICATE_RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "if_predicate_channel_cluster1_0" + // CHECK-NEXT: tf.IfRegion"(%[[PREDICATE_RECV_OUTPUT]]) + // CHECK-NEXT: %[[PREDICATE2_RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "if_predicate_channel_cluster1_1" + // CHECK-NEXT: tf.IfRegion"(%[[PREDICATE2_RECV_OUTPUT]]) + // CHECK-NEXT: "tf.Yield"() : () -> () + // CHECK: %[[ARG_RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "host_compute_channel_cluster1_args" + // CHECK: "tf.D"(%[[ARG_RECV_OUTPUT]]) + // CHECK: "tf._XlaSendFromHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "host_compute_channel_cluster1_retvals" + // CHECK-NEXT: "tf.Yield"() : () -> () + + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" + // CHECK: %[[G_OUTPUT:[0-9]*]] = "tf.G" + // CHECK: "tf.XlaSendToHost"(%[[G_OUTPUT]]) {key = "if_predicate_channel_cluster1_0"} + // CHECK-NEXT: tf.IfRegion"(%[[G_OUTPUT]]) + // CHECK: %[[H_OUTPUT:[0-9]*]] = "tf.H"(%[[B_OUTPUT]]) + // CHECK: "tf.XlaSendToHost"(%[[H_OUTPUT]]) {key = "if_predicate_channel_cluster1_1"} + // CHECK-NEXT: tf.IfRegion"(%[[H_OUTPUT]]) + // CHECK-NEXT: "tf.Yield"() : () -> () + // CHECK: %[[I_OUTPUT:[0-9]*]] = "tf.I"(%[[H_OUTPUT]]) + // CHECK: "tf._XlaHostComputeMlir"(%[[I_OUTPUT]]) + // CHECK-NEXT: "tf.Yield"() : () -> () + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + %4 = "tf.B"() : () -> (tensor) + %6 = "tf.G"() : () -> (tensor) + + "tf.IfRegion"(%6) ({ + %7 = "tf.H"(%4) : (tensor) -> (tensor) + + "tf.IfRegion"(%7)({ + "tf.Yield"() : () -> () + }, + { + %8 = "tf.I"(%7) : (tensor) -> (tensor) + "tf.D"(%8) {_xla_outside_compilation = "cluster1"} : (tensor) -> () + "tf.Yield"() : () -> () + }) { is_stateless = false} : (tensor) -> () + + "tf.Yield"() : () -> () + }, { + "tf.Yield"() : () -> () + }) { 
is_stateless = false} : (tensor) -> () + + %5 = "tf.E"() : () -> tensor + tf_device.return %5 : tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor + } } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir index fa70ca85419..2a0091ce9bf 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir @@ -1,4 +1,4 @@ -// RUN: tf-opt %s -split-input-file -verify-diagnostics -tf-tpu-rewrite -tpu_compile_metadata_debug | FileCheck %s +// RUN: tf-opt %s -split-input-file -verify-diagnostics -tf-tpu-rewrite -tpu_compile_metadata_debug | FILECHECK_OPTS="" FileCheck %s // Tests module with missing `tf.versions` attribute. @@ -1256,21 +1256,21 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: "tf._TPUCompileMlir" // CHECK: "tf.TPUCompileSucceededAssert" // CHECK: "tf_device.parallel_execute" - // CHECK-NOT:"tf._TPUCompileMlir" + // CHECK-NOT:"tf._TPUCompileMlirPlaceholderProgramKey" // CHECK: "tf.D"(%[[COMPILE_OUTPUT]]#1 // CHECK: "tf.TPUExecute" - // CHECK-NOT:"tf._TPUCompileMlir" + // CHECK-NOT:"tf._TPUCompileMlirPlaceholderProgramKey" // CHECK: "tf.E"(%[[COMPILE_OUTPUT]]#1 %3 = "tf_device.parallel_execute"() ( { - %status, %program = "tf._TPUCompileMlir"() {metadata = "...", mlir_module = "..."} : () -> (tensor, tensor) - "tf.D"(%program) : (tensor) -> () + %program = "tf._TPUCompileMlirPlaceholderProgramKey"() : () -> tensor + "tf.D"(%program) : (tensor) -> () tf_device.return }, { %4 = "tf_device.cluster_func"(%ri_0) {_tpu_replicate = "cluster0", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], use_spmd_for_xla_partitioning = false} : (tensor) -> tensor tf_device.return %4 : tensor }, { - %status, %program = "tf._TPUCompileMlir"() {metadata = "...", mlir_module = "..."} : () -> (tensor, tensor) - "tf.E"(%program) : (tensor) -> () + %program = "tf._TPUCompileMlirPlaceholderProgramKey"() : () -> tensor + "tf.E"(%program) : (tensor) -> () tf_device.return }) : () -> (tensor) tf_device.return %3 : tensor diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_space_to_depth_pass.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_space_to_depth_pass.mlir index 199426b1aa9..280986a7ee1 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_space_to_depth_pass.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_space_to_depth_pass.mlir @@ -7,7 +7,7 @@ module attributes {tf.devices = {"/job:localhost/replica:0/task:0/device:CPU:0" %0 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor %1 = "tf.Const"() {value = dense<2> : tensor} : () -> tensor %2 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - %3:10 = "tf.While"(%2, %1, %2, %0, %1, %arg2, %arg4, %arg5, %arg6, %arg7) {_lower_using_switch_merge = true, _num_original_outputs = 10 : i64, _read_only_resource_inputs = [], body = @while_body_2710, cond = @while_cond_2700, device = "", is_stateless = false, output_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>], parallel_iterations = 10 : i64} 
: (tensor, tensor, tensor, tensor, tensor, tensor, tensor>>, tensor>>, tensor>>, tensor>>) -> (tensor, tensor, tensor, tensor, tensor, tensor, tensor>>, tensor>>, tensor>>, tensor>>) + %3:10 = "tf.While"(%2, %1, %2, %0, %1, %arg2, %arg4, %arg5, %arg6, %arg7) {_lower_using_switch_merge = true, _num_original_outputs = 10 : i64, _read_only_resource_inputs = [], body = @while_body_2710, cond = @while_cond_2700, device = "", is_stateless = false, parallel_iterations = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor>>, tensor>>, tensor>>, tensor>>) -> (tensor, tensor, tensor, tensor, tensor, tensor, tensor>>, tensor>>, tensor>>, tensor>>) return } // CHECK-LABEL: func @while_body_2710 diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_update_embedding_enqueue_op_inputs.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_update_embedding_enqueue_op_inputs.mlir index b77e4b1fbd0..47374b7f7d4 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_update_embedding_enqueue_op_inputs.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_update_embedding_enqueue_op_inputs.mlir @@ -9,16 +9,15 @@ // CHECK-SAME: %[[ARG_5:[a-z0-9]*]]: tensor // CHECK-SAME: %[[ARG_6:[a-z0-9]*]]: tensor // CHECK-SAME: %[[ARG_7:[a-z0-9]*]]: tensor -// CHECK-SAME: %[[ARG_8:[a-z0-9]*]]: tensor func @check_enqueue_ops_update_for_eval(%arg0: tensor, %arg1: tensor, %arg2 :tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, - %arg6: tensor, %arg7: tensor, %arg8: tensor) -> () { + %arg6: tensor, %arg7: tensor) -> () { // CHECK: %[[CONST_0:[a-z0-9]*]] = "tf.Const"() %0 = "tf.Const"() {value = dense<[]> : tensor<0xf32>} : () -> tensor<0xf32> - %1 = "tf.SelectV2"(%arg8, %arg6, %arg7) : (tensor, tensor, tensor) -> tensor - // CHECK: "tf.EnqueueTPUEmbeddingSparseTensorBatch"(%[[ARG_0]], %[[ARG_1]], %[[ARG_2]], %[[ARG_3]], %[[ARG_4]], %[[ARG_5]], %[[CONST_0]], %[[CONST_0]], %[[CONST_0]], %[[ARG_7]]) - "tf.EnqueueTPUEmbeddingSparseTensorBatch"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %0, %0, %0, %1) {_tpu_embedding_layer = "call1", _xla_outside_compilation = "0", combiners = ["mean", "sum"], device_ordinal = -1 : i64, max_sequence_lengths = [0, 0, 0], table_ids = [1, 1, 0]} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor<0xf32>, tensor<0xf32>, tensor<0xf32>, tensor) -> () + // CHECK: %[[CONST_MODE:[a-z0-9]*]] = "tf.Const"() {_xla_outside_compilation = "0", value = dense<"inference"> : tensor} : () -> tensor + // CHECK: "tf.EnqueueTPUEmbeddingSparseTensorBatch"(%[[ARG_0]], %[[ARG_1]], %[[ARG_2]], %[[ARG_3]], %[[ARG_4]], %[[ARG_5]], %[[CONST_0]], %[[CONST_0]], %[[CONST_0]], %[[CONST_MODE]]) + "tf.EnqueueTPUEmbeddingSparseTensorBatch"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %0, %0, %0, %arg7) {_tpu_embedding_layer = "call1", _xla_outside_compilation = "0", combiners = ["mean", "sum"], device_ordinal = -1 : i64, max_sequence_lengths = [0, 0, 0], table_ids = [1, 1, 0]} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor<0xf32>, tensor<0xf32>, tensor<0xf32>, tensor) -> () %2:2 = "tf.RecvTPUEmbeddingActivations"() {_tpu_embedding_layer = "call1", config = "\0A\0B\0C\0D"} : () -> (tensor<2x2xf32>, tensor<4x4xf32>) return } @@ -34,20 +33,19 @@ func @check_enqueue_ops_update_for_eval(%arg0: tensor, %arg1: tensor // CHECK-SAME: %[[ARG_6:[a-z0-9]*]]: tensor // CHECK-SAME: %[[ARG_7:[a-z0-9]*]]: tensor -// CHECK-SAME: %[[ARG_8:[a-z0-9]*]]: tensor func @check_enqueue_ops_update_for_training(%arg0: tensor, %arg1: tensor, %arg2 :tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, - %arg6: tensor, 
%arg7: tensor, %arg8: tensor) -> () { + %arg6: tensor, %arg7: tensor) -> () { // CHECK: %[[CONST_0:[a-z0-9]*]] = "tf.Const"() %0 = "tf.Const"() {value = dense<[]> : tensor<0xf32>} : () -> tensor<0xf32> - %1 = "tf.SelectV2"(%arg8, %arg6, %arg7) : (tensor, tensor, tensor) -> tensor %2 = "tf.Const"() {value = dense<0.0> : tensor<2x2xf32>} : () -> tensor<2x2xf32> %3 = "tf.Const"() {value = dense<0.0> : tensor<4x4xf32>} : () -> tensor<4x4xf32> "tf.SendTPUEmbeddingGradients"(%2, %3) {_tpu_embedding_layer = "call1", config = "\0A\0B\0C\0D", operand_segment_sizes = dense<[2, 0]> : vector<2xi32>} : (tensor<2x2xf32>, tensor<4x4xf32>) -> () - // CHECK: "tf.EnqueueTPUEmbeddingSparseTensorBatch"(%[[ARG_0]], %[[ARG_1]], %[[ARG_2]], %[[ARG_3]], %[[ARG_4]], %[[ARG_5]], %[[CONST_0]], %[[CONST_0]], %[[CONST_0]], %[[ARG_6]]) - "tf.EnqueueTPUEmbeddingSparseTensorBatch"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %0, %0, %0, %1) {_tpu_embedding_layer = "call1", _xla_outside_compilation = "0", combiners = ["mean", "sum"], device_ordinal = -1 : i64, max_sequence_lengths = [0, 0, 0], table_ids = [1, 1, 0]} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor<0xf32>, tensor<0xf32>, tensor<0xf32>, tensor) -> () + // CHECK: %[[CONST_MODE:[a-z0-9]*]] = "tf.Const"() {_xla_outside_compilation = "0", value = dense<"train"> : tensor} : () -> tensor + // CHECK: "tf.EnqueueTPUEmbeddingSparseTensorBatch"(%[[ARG_0]], %[[ARG_1]], %[[ARG_2]], %[[ARG_3]], %[[ARG_4]], %[[ARG_5]], %[[CONST_0]], %[[CONST_0]], %[[CONST_0]], %[[CONST_MODE]]) + "tf.EnqueueTPUEmbeddingSparseTensorBatch"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %0, %0, %0, %arg7) {_tpu_embedding_layer = "call1", _xla_outside_compilation = "0", combiners = ["mean", "sum"], device_ordinal = -1 : i64, max_sequence_lengths = [0, 0, 0], table_ids = [1, 1, 0]} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor<0xf32>, tensor<0xf32>, tensor<0xf32>, tensor) -> () %4:2 = "tf.RecvTPUEmbeddingActivations"() {_tpu_embedding_layer = "call1", config = "\0A\0B\0C\0D"} : () -> (tensor<2x2xf32>, tensor<4x4xf32>) return } @@ -65,15 +63,3 @@ func @check_enqueue_ops_with_different_attr_disallowed(%arg0: tensor, % return } -// ----- - -func @check_embedding_ops_with_missing_attribute_disallowed(%arg0: tensor, %arg1: tensor, - %arg2 :tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, - %arg6: tensor, %arg7: tensor, %arg8: tensor) -> () { - %0 = "tf.Const"() {value = dense<[]> : tensor<0xf32>} : () -> tensor<0xf32> - %1 = "tf.SelectV2"(%arg8, %arg6, %arg7) : (tensor, tensor, tensor) -> tensor - "tf.EnqueueTPUEmbeddingSparseTensorBatch"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %0, %0, %0, %1) {_tpu_embedding_layer = "call_123", _xla_outside_compilation = "0", combiners = ["mean", "sum"], device_ordinal = -1 : i64, max_sequence_lengths = [0, 0, 0], table_ids = [1, 1, 0]} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor<0xf32>, tensor<0xf32>, tensor<0xf32>, tensor) -> () - // expected-error @+1 {{'tf.RecvTPUEmbeddingActivations' op requires attribute '_tpu_embedding_layer'}} - %2:2 = "tf.RecvTPUEmbeddingActivations"() {config = "\0A\0B\0C\0D"} : () -> (tensor<2x2xf32>, tensor<4x4xf32>) - return -} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/unroll-batch-matmul.mlir b/tensorflow/compiler/mlir/tensorflow/tests/unroll-batch-matmul.mlir index 5a3f0b6e997..7cf5f19523d 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/unroll-batch-matmul.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/unroll-batch-matmul.mlir @@ -67,41 +67,35 @@ func 
@batchMatMulV2FlatInput(%arg0: tensor<3x4x5xf32>, %arg1: tensor<3x5x6xf32>) return %0 : tensor<3x4x6xf32> // CHECK-LABEL: batchMatMulV2FlatInput - // CHECK: %[[cst:.*]] = "tf.Const"() {value = dense<[3, 4, 5]> : tensor<3xi64>} // CHECK: %[[cst_0:.*]] = "tf.Const"() {value = dense<[1, 4, 5]> : tensor<3xi64>} // CHECK: %[[cst_1:.*]] = "tf.Const"() {value = dense<[4, 5]> : tensor<2xi64>} - // CHECK: %[[cst_2:.*]] = "tf.Const"() {value = dense<[3, 5, 6]> : tensor<3xi64>} - // CHECK: %[[cst_3:.*]] = "tf.Const"() {value = dense<0> : tensor<3xi64>} - // CHECK: %[[cst_4:.*]] = "tf.Const"() {value = dense<[1, 0, 0]> : tensor<3xi64>} - // CHECK: %[[cst_5:.*]] = "tf.Const"() {value = dense<[2, 0, 0]> : tensor<3xi64>} - // CHECK: %[[cst_6:.*]] = "tf.Const"() {value = dense<[1, 5, 6]> : tensor<3xi64>} - // CHECK: %[[cst_7:.*]] = "tf.Const"() {value = dense<[5, 6]> : tensor<2xi64>} - // CHECK: %[[cst_8:.*]] = "tf.Const"() {value = dense<[3, 4, 6]> : tensor<3xi64>} + // CHECK: %[[cst_2:.*]] = "tf.Const"() {value = dense<0> : tensor<3xi64>} + // CHECK: %[[cst_3:.*]] = "tf.Const"() {value = dense<[1, 0, 0]> : tensor<3xi64>} + // CHECK: %[[cst_4:.*]] = "tf.Const"() {value = dense<[2, 0, 0]> : tensor<3xi64>} + // CHECK: %[[cst_5:.*]] = "tf.Const"() {value = dense<[1, 5, 6]> : tensor<3xi64>} + // CHECK: %[[cst_6:.*]] = "tf.Const"() {value = dense<[5, 6]> : tensor<2xi64>} - // CHECK: %[[v0:.*]] = "tf.Reshape"(%arg0, %[[cst]]) : (tensor<3x4x5xf32>, tensor<3xi64>) -> tensor<3x4x5xf32> - // CHECK: %[[v1:.*]] = "tf.Slice"(%[[v0]], %[[cst_3]], %[[cst_0]]) : (tensor<3x4x5xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x4x5xf32> - // CHECK: %[[v2:.*]] = "tf.Reshape"(%[[v1]], %[[cst_1]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> - // CHECK: %[[v3:.*]] = "tf.Slice"(%[[v0]], %[[cst_4]], %[[cst_0]]) : (tensor<3x4x5xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x4x5xf32> - // CHECK: %[[v4:.*]] = "tf.Reshape"(%[[v3]], %[[cst_1]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> - // CHECK: %[[v5:.*]] = "tf.Slice"(%[[v0]], %[[cst_5]], %[[cst_0]]) : (tensor<3x4x5xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x4x5xf32> - // CHECK: %[[v6:.*]] = "tf.Reshape"(%[[v5]], %[[cst_1]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> + // CHECK: %[[v0:.*]] = "tf.Slice"(%arg0, %[[cst_2]], %[[cst_0]]) : (tensor<3x4x5xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x4x5xf32> + // CHECK: %[[v1:.*]] = "tf.Reshape"(%[[v0]], %[[cst_1]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> + // CHECK: %[[v2:.*]] = "tf.Slice"(%arg0, %[[cst_3]], %[[cst_0]]) : (tensor<3x4x5xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x4x5xf32> + // CHECK: %[[v3:.*]] = "tf.Reshape"(%[[v2]], %[[cst_1]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> + // CHECK: %[[v4:.*]] = "tf.Slice"(%arg0, %[[cst_4]], %[[cst_0]]) : (tensor<3x4x5xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x4x5xf32> + // CHECK: %[[v5:.*]] = "tf.Reshape"(%[[v4]], %[[cst_1]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> - // CHECK: %[[v7:.*]] = "tf.Reshape"(%arg1, %[[cst_2]]) : (tensor<3x5x6xf32>, tensor<3xi64>) -> tensor<3x5x6xf32> - // CHECK: %[[v8:.*]] = "tf.Slice"(%[[v7]], %[[cst_3]], %[[cst_6]]) : (tensor<3x5x6xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x5x6xf32> - // CHECK: %[[v9:.*]] = "tf.Reshape"(%[[v8]], %[[cst_7]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> - // CHECK: %[[v10:.*]] = "tf.Slice"(%[[v7]], %[[cst_4]], %[[cst_6]]) : (tensor<3x5x6xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x5x6xf32> - // CHECK: 
%[[v11:.*]] = "tf.Reshape"(%[[v10]], %[[cst_7]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> - // CHECK: %[[v12:.*]] = "tf.Slice"(%[[v7]], %[[cst_5]], %[[cst_6]]) : (tensor<3x5x6xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x5x6xf32> - // CHECK: %[[v13:.*]] = "tf.Reshape"(%[[v12]], %[[cst_7]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> + // CHECK: %[[v6:.*]] = "tf.Slice"(%arg1, %[[cst_2]], %[[cst_5]]) : (tensor<3x5x6xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x5x6xf32> + // CHECK: %[[v7:.*]] = "tf.Reshape"(%[[v6]], %[[cst_6]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> + // CHECK: %[[v8:.*]] = "tf.Slice"(%arg1, %[[cst_3]], %[[cst_5]]) : (tensor<3x5x6xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x5x6xf32> + // CHECK: %[[v9:.*]] = "tf.Reshape"(%[[v8]], %[[cst_6]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> + // CHECK: %[[v10:.*]] = "tf.Slice"(%arg1, %[[cst_4]], %[[cst_5]]) : (tensor<3x5x6xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x5x6xf32> + // CHECK: %[[v11:.*]] = "tf.Reshape"(%[[v10]], %[[cst_6]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> - // CHECK: %[[v14:.*]] = "tf.MatMul"(%[[v2]], %[[v9]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[v15:.*]] = "tf.MatMul"(%[[v4]], %[[v11]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[v16:.*]] = "tf.MatMul"(%[[v6]], %[[v13]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[mm0:.*]] = "tf.MatMul"(%[[v1]], %[[v7]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[mm1:.*]] = "tf.MatMul"(%[[v3]], %[[v9]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[mm2:.*]] = "tf.MatMul"(%[[v5]], %[[v11]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[v17:.*]] = "tf.Pack"(%[[v14]], %[[v15]], %[[v16]]) {axis = 0 : i64} : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<3x4x6xf32> - // CHECK: %[[v18:.*]] = "tf.Reshape"(%[[v17]], %[[cst_8]]) : (tensor<3x4x6xf32>, tensor<3xi64>) -> tensor<3x4x6xf32> + // CHECK: %[[v17:.*]] = "tf.Pack"(%[[mm0]], %[[mm1]], %[[mm2]]) {axis = 0 : i64} : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<3x4x6xf32> - // CHECK: return %[[v18]] : tensor<3x4x6xf32> + // CHECK: return %[[v17]] : tensor<3x4x6xf32> } // ----- @@ -184,41 +178,35 @@ func @batchMatMulFlatInput(%arg0: tensor<3x4x5xf32>, %arg1: tensor<3x5x6xf32>) - return %0 : tensor<3x4x6xf32> // CHECK-LABEL: batchMatMulFlatInput - // CHECK: %[[cst:.*]] = "tf.Const"() {value = dense<[3, 4, 5]> : tensor<3xi64>} // CHECK: %[[cst_0:.*]] = "tf.Const"() {value = dense<[1, 4, 5]> : tensor<3xi64>} // CHECK: %[[cst_1:.*]] = "tf.Const"() {value = dense<[4, 5]> : tensor<2xi64>} - // CHECK: %[[cst_2:.*]] = "tf.Const"() {value = dense<[3, 5, 6]> : tensor<3xi64>} - // CHECK: %[[cst_3:.*]] = "tf.Const"() {value = dense<0> : tensor<3xi64>} - // CHECK: %[[cst_4:.*]] = "tf.Const"() {value = dense<[1, 0, 0]> : tensor<3xi64>} - // CHECK: %[[cst_5:.*]] = "tf.Const"() {value = dense<[2, 0, 0]> : tensor<3xi64>} - // CHECK: %[[cst_6:.*]] = "tf.Const"() {value = dense<[1, 5, 6]> : tensor<3xi64>} - // CHECK: %[[cst_7:.*]] = "tf.Const"() {value = dense<[5, 6]> : tensor<2xi64>} - // CHECK: %[[cst_8:.*]] = "tf.Const"() 
{value = dense<[3, 4, 6]> : tensor<3xi64>} + // CHECK: %[[cst_2:.*]] = "tf.Const"() {value = dense<0> : tensor<3xi64>} + // CHECK: %[[cst_3:.*]] = "tf.Const"() {value = dense<[1, 0, 0]> : tensor<3xi64>} + // CHECK: %[[cst_4:.*]] = "tf.Const"() {value = dense<[2, 0, 0]> : tensor<3xi64>} + // CHECK: %[[cst_5:.*]] = "tf.Const"() {value = dense<[1, 5, 6]> : tensor<3xi64>} + // CHECK: %[[cst_6:.*]] = "tf.Const"() {value = dense<[5, 6]> : tensor<2xi64>} - // CHECK: %[[v0:.*]] = "tf.Reshape"(%arg0, %[[cst]]) : (tensor<3x4x5xf32>, tensor<3xi64>) -> tensor<3x4x5xf32> - // CHECK: %[[v1:.*]] = "tf.Slice"(%[[v0]], %[[cst_3]], %[[cst_0]]) : (tensor<3x4x5xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x4x5xf32> - // CHECK: %[[v2:.*]] = "tf.Reshape"(%[[v1]], %[[cst_1]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> - // CHECK: %[[v3:.*]] = "tf.Slice"(%[[v0]], %[[cst_4]], %[[cst_0]]) : (tensor<3x4x5xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x4x5xf32> - // CHECK: %[[v4:.*]] = "tf.Reshape"(%[[v3]], %[[cst_1]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> - // CHECK: %[[v5:.*]] = "tf.Slice"(%[[v0]], %[[cst_5]], %[[cst_0]]) : (tensor<3x4x5xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x4x5xf32> - // CHECK: %[[v6:.*]] = "tf.Reshape"(%[[v5]], %[[cst_1]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> + // CHECK: %[[v0:.*]] = "tf.Slice"(%arg0, %[[cst_2]], %[[cst_0]]) : (tensor<3x4x5xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x4x5xf32> + // CHECK: %[[v1:.*]] = "tf.Reshape"(%[[v0]], %[[cst_1]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> + // CHECK: %[[v2:.*]] = "tf.Slice"(%arg0, %[[cst_3]], %[[cst_0]]) : (tensor<3x4x5xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x4x5xf32> + // CHECK: %[[v3:.*]] = "tf.Reshape"(%[[v2]], %[[cst_1]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> + // CHECK: %[[v4:.*]] = "tf.Slice"(%arg0, %[[cst_4]], %[[cst_0]]) : (tensor<3x4x5xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x4x5xf32> + // CHECK: %[[v5:.*]] = "tf.Reshape"(%[[v4]], %[[cst_1]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> - // CHECK: %[[v7:.*]] = "tf.Reshape"(%arg1, %[[cst_2]]) : (tensor<3x5x6xf32>, tensor<3xi64>) -> tensor<3x5x6xf32> - // CHECK: %[[v8:.*]] = "tf.Slice"(%[[v7]], %[[cst_3]], %[[cst_6]]) : (tensor<3x5x6xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x5x6xf32> - // CHECK: %[[v9:.*]] = "tf.Reshape"(%[[v8]], %[[cst_7]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> - // CHECK: %[[v10:.*]] = "tf.Slice"(%[[v7]], %[[cst_4]], %[[cst_6]]) : (tensor<3x5x6xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x5x6xf32> - // CHECK: %[[v11:.*]] = "tf.Reshape"(%[[v10]], %[[cst_7]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> - // CHECK: %[[v12:.*]] = "tf.Slice"(%[[v7]], %[[cst_5]], %[[cst_6]]) : (tensor<3x5x6xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x5x6xf32> - // CHECK: %[[v13:.*]] = "tf.Reshape"(%[[v12]], %[[cst_7]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> + // CHECK: %[[v6:.*]] = "tf.Slice"(%arg1, %[[cst_2]], %[[cst_5]]) : (tensor<3x5x6xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x5x6xf32> + // CHECK: %[[v7:.*]] = "tf.Reshape"(%[[v6]], %[[cst_6]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> + // CHECK: %[[v8:.*]] = "tf.Slice"(%arg1, %[[cst_3]], %[[cst_5]]) : (tensor<3x5x6xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x5x6xf32> + // CHECK: %[[v9:.*]] = "tf.Reshape"(%[[v8]], %[[cst_6]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> + // CHECK: %[[v10:.*]] = "tf.Slice"(%arg1, 
%[[cst_4]], %[[cst_5]]) : (tensor<3x5x6xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x5x6xf32> + // CHECK: %[[v11:.*]] = "tf.Reshape"(%[[v10]], %[[cst_6]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> - // CHECK: %[[v14:.*]] = "tf.MatMul"(%[[v2]], %[[v9]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[v15:.*]] = "tf.MatMul"(%[[v4]], %[[v11]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[v16:.*]] = "tf.MatMul"(%[[v6]], %[[v13]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[mm0:.*]] = "tf.MatMul"(%[[v1]], %[[v7]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[mm1:.*]] = "tf.MatMul"(%[[v3]], %[[v9]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[mm2:.*]] = "tf.MatMul"(%[[v5]], %[[v11]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[v17:.*]] = "tf.Pack"(%[[v14]], %[[v15]], %[[v16]]) {axis = 0 : i64} : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<3x4x6xf32> - // CHECK: %[[v18:.*]] = "tf.Reshape"(%[[v17]], %[[cst_8]]) : (tensor<3x4x6xf32>, tensor<3xi64>) -> tensor<3x4x6xf32> + // CHECK: %[[v17:.*]] = "tf.Pack"(%[[mm0]], %[[mm1]], %[[mm2]]) {axis = 0 : i64} : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<3x4x6xf32> - // CHECK: return %[[v18]] : tensor<3x4x6xf32> + // CHECK: return %[[v17]] : tensor<3x4x6xf32> } // ----- diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/attribute_utils.h b/tensorflow/compiler/mlir/tensorflow/transforms/attribute_utils.h new file mode 100644 index 00000000000..599a8df63d7 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/attribute_utils.h @@ -0,0 +1,42 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_ATTRIBUTE_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_ATTRIBUTE_UTILS_H_ + +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project + +namespace mlir { +namespace TF { + +// Copies attributes that satisfy the given predicate from `from` to `to`. +template +void CopyAttributes(Operation *from, Operation *to, Predicate P) { + for (const NamedAttribute &attr : from->getAttrs()) + if (P(attr)) to->setAttr(attr.first, attr.second); +} + +// Copies attributes whose name begins with an _ from `from` to `to`. 
+inline void CopyUnderscoredAttributes(Operation *from, Operation *to) { + CopyAttributes(from, to, [](const NamedAttribute &attr) { + return attr.first.strref().front() == '_'; + }); +} + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_ATTRIBUTE_UTILS_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc index 1963931b497..2a5c8a05ef3 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc @@ -40,14 +40,17 @@ void EnableLogging(PassManager *pm) { namespace TFTPU { namespace { void AddGraphExportLoweringPasses(OpPassManager &pm) { + auto add_pass = [&](std::unique_ptr pass) { + pm.addNestedPass(std::move(pass)); + pm.addPass(CreateBreakUpIslandsPass()); + }; + pm.addNestedPass(CreateFunctionalToExecutorDialectConversionPass()); - pm.addNestedPass(CreateBreakUpIslandsPass()); - pm.addNestedPass(TFDevice::CreateReplicateToIslandPass()); - pm.addNestedPass(CreateBreakUpIslandsPass()); - pm.addNestedPass(TFDevice::CreateParallelExecuteToIslandsPass()); - pm.addNestedPass(CreateBreakUpIslandsPass()); - pm.addNestedPass(TFDevice::CreateLaunchToDeviceAttributePass()); - pm.addNestedPass(CreateBreakUpIslandsPass()); + add_pass(TFDevice::CreateParallelizeEmbeddingParamsOpsPass()); + pm.addPass(TFDevice::CreateReplicateToIslandPass()); + pm.addPass(CreateBreakUpIslandsPass()); + add_pass(TFDevice::CreateParallelExecuteToIslandsPass()); + add_pass(TFDevice::CreateLaunchToDeviceAttributePass()); } tensorflow::Status RunTPUBridge( @@ -80,14 +83,23 @@ void CreateTPUBridgePipeline(OpPassManager &pm) { // Run shape inference so that tf_executor/tf_device ops created later will // likely to inherit more concrete types. pm.addPass(TF::CreateTFShapeInferencePass()); - OpPassManager &func_pm = pm.nest(); - func_pm.addPass(CreateTPUClusterFormationPass()); - // Place DecomposeResourceOpsPass before TFExecutorConstantSinking pass - // because DecomposeResourceOpsPass uses pattern rewriter which hoists - // changed constants out of tf_device.Launch. - func_pm.addPass(TFDevice::CreateDecomposeResourceOpsPass()); - func_pm.addPass(CreateTPUHostComputationExpansionPass()); + // Encode this in its own scope so that func_pm is not mistakenly used + // later on. + { + OpPassManager &func_pm = pm.nest(); + func_pm.addPass(CreateTPUClusterFormationPass()); + // Place DecomposeResourceOpsPass before TFExecutorConstantSinking pass + // because DecomposeResourceOpsPass uses pattern rewriter which hoists + // changed constants out of tf_device.Launch. + func_pm.addPass(TFDevice::CreateDecomposeResourceOpsPass()); + func_pm.addPass(CreateTPUHostComputationExpansionPass()); + func_pm.addPass(CreateTPUUpdateEmbeddingEnqueueOpInputsPass()); + } + pm.addPass(TF::CreateTFFunctionalControlFlowToRegions()); + pm.addPass(mlir::createInlinerPass()); pm.addPass(CreateTPUExtractHeadTailOutsideCompilationPass()); + pm.addPass(TF::CreateTFRegionControlFlowToFunctional()); + // Run another shape inference pass because resource decomposition might have // created new partial types. 
pm.addPass(TF::CreateTFShapeInferencePass()); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize.td b/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize.td index 9d72284da91..d5b7eb7a739 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize.td @@ -150,6 +150,7 @@ def LogToLog1p : Pat< // LogicalNot op patterns. //===----------------------------------------------------------------------===// +// TODO(ezhulenev): Generalize this pattern for all involutions. def LogicalNotNested : Pat<(TF_LogicalNotOp (TF_LogicalNotOp $arg)), (replaceWithValue $arg)>; @@ -187,6 +188,13 @@ def NegNested : Pat<(TF_NegOp (TF_NegOp $arg)), (replaceWithValue $arg)>; def RealDivWithSqrtDivisor : Pat<(TF_RealDivOp $arg0, (TF_SqrtOp $arg1)), (TF_MulOp $arg0, (TF_RsqrtOp $arg1))>; +// Replace division by a constant with a multiplication by a reciprocal of that +// constant. Floating point division can be ~10x more expensive than a +// multiplication. +def RealDivWithConstDivisor : Pat< + (TF_RealDivOp $arg0, (TF_ConstOp FloatElementsAttr<32>:$value)), + (TF_MulOp $arg0, (TF_ReciprocalOp (TF_ConstOp $value)))>; + //===----------------------------------------------------------------------===// // Reciprocal op patterns. //===----------------------------------------------------------------------===// @@ -201,6 +209,11 @@ def ReciprocalNested : Pat<(TF_ReciprocalOp (TF_ReciprocalOp $arg)), def RedundantReshape : Pat<(TF_ReshapeOp (TF_ReshapeOp $arg, $unused), $shape), (TF_ReshapeOp $arg, $shape)>; +def IsSame : Constraint>; +def ReshapeToSelfShape : Pat<(TF_ReshapeOp $arg0, (TF_ShapeOp $arg1)), + (replaceWithValue $arg0), + [(IsSame $arg0, $arg1)]>; + //===----------------------------------------------------------------------===// // Select op patterns. 
//===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/collection_ops_util.cc b/tensorflow/compiler/mlir/tensorflow/transforms/collection_ops_util.cc index 58c4eac5c95..57a5cd888a1 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/collection_ops_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/collection_ops_util.cc @@ -77,8 +77,7 @@ Value GetIndicesForElement(Value index, Value buffer, OpBuilder builder, ArrayRef{RankedTensorType::get( {static_cast(buffer_type.getShape().size())}, getElementTypeOrSelf(index.getType()))}, - ArrayRef{index, zeros_tensor, CreateScalarConst(0, builder, loc)}, - ArrayRef{}); + ArrayRef{index, zeros_tensor, CreateScalarConst(0, builder, loc)}); } Value GetElement(Value index, Value buffer, OpBuilder builder, Location loc, @@ -95,15 +94,14 @@ Value GetElement(Value index, Value buffer, OpBuilder builder, Location loc, auto slice = builder.create( loc, ArrayRef{slice_type}, ArrayRef{buffer, GetIndicesForElement(index, buffer, builder, loc), - size_const}, - ArrayRef{}); + size_const}); if (keep_slice_shape) return slice; auto element_type = RankedTensorType::get(buffer_type.getShape().drop_front(), buffer_type.getElementType()); auto reshape = builder.create( loc, ArrayRef{element_type}, - ArrayRef{slice, GetR1Const(element_type.getShape(), builder, loc)}, - ArrayRef{}); + ArrayRef{slice, + GetR1Const(element_type.getShape(), builder, loc)}); return reshape.output(); } @@ -120,15 +118,13 @@ Value SetElement(Value index, Value buffer, Value element, OpBuilder builder, if (element.getType() != slice_type) { update_slice = builder.create( loc, ArrayRef{slice_type}, - ArrayRef{element, GetR1Const(slice_shape, builder, loc)}, - ArrayRef{}); + ArrayRef{element, GetR1Const(slice_shape, builder, loc)}); } return builder .create( loc, ArrayRef{buffer.getType()}, ArrayRef{buffer, update_slice, - GetIndicesForElement(index, buffer, builder, loc)}, - ArrayRef{}) + GetIndicesForElement(index, buffer, builder, loc)}) .output(); } @@ -140,8 +136,7 @@ Value ReshapeScalarToSizeType(OpBuilder builder, Value scalar, Location loc) { auto size_type = GetSizeType(builder); return builder.create( loc, ArrayRef{size_type}, - ArrayRef{scalar, GetR1Const(size_type.getShape(), builder, loc)}, - ArrayRef{}); + ArrayRef{scalar, GetR1Const(size_type.getShape(), builder, loc)}); } LogicalResult CreateInitBufferValue(ArrayRef element_shape, @@ -171,13 +166,12 @@ LogicalResult CreateInitBufferValue(ArrayRef element_shape, if (getElementTypeOrSelf(zero.getType()) != element_dtype) { zero = builder.create( op->getLoc(), ArrayRef{RankedTensorType::get({}, element_dtype)}, - ArrayRef{zero}, ArrayRef{}); + ArrayRef{zero}); } auto buffer_type = RankedTensorType::get(buffer_shape, element_dtype); auto broadcast = builder.create( op->getLoc(), ArrayRef{buffer_type}, - ArrayRef{zero, GetR1Const(buffer_shape, builder, op->getLoc())}, - ArrayRef{}); + ArrayRef{zero, GetR1Const(buffer_shape, builder, op->getLoc())}); *buffer = broadcast.output(); return success(); } @@ -187,14 +181,14 @@ llvm::Optional GetElementTypeFromAccess( llvm::function_ref(Operation*)> infer_from_op) { for (auto& use : collection.getUses()) { if (auto while_op = llvm::dyn_cast(use.getOwner())) { - auto body = module.lookupSymbol(while_op.body()); + auto body = while_op.body_func(); assert(body); auto type_from_body = GetElementTypeFromAccess( body.getArgument(use.getOperandNumber()), module, infer_from_op); if 
(type_from_body.hasValue()) return type_from_body; } else if (auto if_op = llvm::dyn_cast(use.getOwner())) { - auto then_branch = module.lookupSymbol(if_op.then_branch()); - auto else_branch = module.lookupSymbol(if_op.else_branch()); + auto then_branch = if_op.then_func(); + auto else_branch = if_op.else_func(); assert(then_branch && else_branch); auto type_from_then = GetElementTypeFromAccess( then_branch.getArgument(use.getOperandNumber() - 1), module, @@ -204,18 +198,8 @@ llvm::Optional GetElementTypeFromAccess( else_branch.getArgument(use.getOperandNumber() - 1), module, infer_from_op); if (type_from_else.hasValue()) return type_from_else; - } else if (auto pcall = - llvm::dyn_cast(use.getOwner())) { - if (!pcall.f().isa()) continue; - auto callee = module.lookupSymbol(pcall.f().getRootReference()); - assert(callee); - auto type_from_callee = GetElementTypeFromAccess( - callee.getArgument(use.getOperandNumber()), module, infer_from_op); - if (type_from_callee.hasValue()) return type_from_callee; - } else if (auto spcall = llvm::dyn_cast( - use.getOwner())) { - auto callee = module.lookupSymbol(spcall.f()); - assert(callee); + } else if (auto call = llvm::dyn_cast(use.getOwner())) { + auto callee = dyn_cast(call.resolveCallable()); auto type_from_callee = GetElementTypeFromAccess( callee.getArgument(use.getOperandNumber()), module, infer_from_op); if (type_from_callee.hasValue()) return type_from_callee; @@ -241,27 +225,24 @@ Value ReadLocalVariable(Value local_var, OpBuilder builder, Location loc) { ArrayRef{getElementTypeOrSelf(local_var.getType()) .cast() .getSubtypes()[0]}, - ArrayRef{local_var}, ArrayRef{}) + ArrayRef{local_var}) .value(); } // Creates an AssignVariableOp on a local variable. TF::AssignVariableOp WriteLocalVariable(Value local_var, Value value, OpBuilder builder, Location loc) { - return builder.create(loc, ArrayRef{}, - ArrayRef{local_var, value}, - ArrayRef{}); + return builder.create( + loc, ArrayRef{}, ArrayRef{local_var, value}); } Value AccumulateBuffers(Value a, Value b, OpBuilder builder, Location loc) { if (getElementTypeOrSelf(a.getType()) == builder.getI1Type()) { return builder.create(loc, ArrayRef{a.getType()}, - ArrayRef{a, b}, - ArrayRef{}); + ArrayRef{a, b}); } return builder.create(loc, ArrayRef{a.getType()}, - ArrayRef{a, b}, - ArrayRef{}); + ArrayRef{a, b}); } namespace { @@ -303,15 +284,13 @@ Value GatherElements(Value indices, Value buffer, OpBuilder builder, return builder.create( loc, ArrayRef{slice_type}, ArrayRef{buffer, GetR1Const(slice_starts, builder, loc), - GetR1Const(result_shape, builder, loc)}, - ArrayRef{}); + GetR1Const(result_shape, builder, loc)}); } auto result_type = RankedTensorType::get(result_shape, buffer_type.getElementType()); return builder.create( loc, ArrayRef{result_type}, - ArrayRef{buffer, indices, CreateScalarConst(0, builder, loc)}, - ArrayRef{}); + ArrayRef{buffer, indices, CreateScalarConst(0, builder, loc)}); } Value ScatterAccumulateElements(Value indices, Value updates, Value buffer, @@ -334,8 +313,7 @@ Value ScatterAccumulateElements(Value indices, Value updates, Value buffer, auto index = builder.create( loc, ArrayRef{GetSizeType(builder)}, ArrayRef{indices, GetR1Const({i}, builder, loc), - GetR1Const({1}, builder, loc)}, - ArrayRef{}); + GetR1Const({1}, builder, loc)}); auto old_slice = GetElement(index, buffer, builder, loc, /*keep_slice_shape=*/true); starts_in_update[0] = i; @@ -344,8 +322,7 @@ Value ScatterAccumulateElements(Value indices, Value updates, Value buffer, builder .create( loc, 
ArrayRef{old_slice.getType()}, - ArrayRef{updates, update_slice_starts, slice_sizes}, - ArrayRef{}) + ArrayRef{updates, update_slice_starts, slice_sizes}) .output(); slice = AccumulateBuffers(old_slice, slice, builder, loc); buffer = SetElement(index, buffer, slice, builder, loc); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc index 007baaae433..1429e2b3fd4 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc @@ -40,7 +40,7 @@ namespace TF { // LINT.IfChange(folding-policy) static bool ShouldBeFolded(Operation* inst) { constexpr int kSizeFactor = 2; - constexpr int64_t kSizeThreshold = (1 << 20); // 128 KB + constexpr int64_t kSizeThreshold = (1 << 21); // 256 KB bool has_unknown_shape = false; auto get_size = [&](TypeRange types) { int64_t size = 0; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/deduplicate_bound_input_bindings.cc b/tensorflow/compiler/mlir/tensorflow/transforms/deduplicate_bound_input_bindings.cc new file mode 100644 index 00000000000..c1514dfa357 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/deduplicate_bound_input_bindings.cc @@ -0,0 +1,65 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include + +#include "llvm/ADT/DenseMap.h" +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" + +namespace mlir { +namespace tf_saved_model { +namespace { + +class DedupBoundInputBindingPass + : public PassWrapper { + public: + void runOnFunction() override; +}; + +void DedupBoundInputBindingPass::runOnFunction() { + FuncOp func = getFunction(); + if (!mlir::tf_saved_model::IsExported(func)) return; + llvm::SmallDenseMap unique_bound_inputs; + llvm::SmallVector arg_indices_to_erase; + for (unsigned i = 0, e = func.getNumArguments(); i < e; i++) { + auto attr = func.getArgAttrOfType( + i, "tf_saved_model.bound_input"); + if (!attr) continue; + auto inserted = unique_bound_inputs.insert(std::make_pair(attr, i)); + if (inserted.second) continue; + auto duplicate_arg = func.getArgument(i); + auto original_arg = func.getArgument(unique_bound_inputs[attr]); + duplicate_arg.replaceAllUsesWith(original_arg); + arg_indices_to_erase.push_back(i); + } + func.eraseArguments(arg_indices_to_erase); +} + +} // namespace + +static PassRegistration pass( + "tf-saved-model-dedup-bound-input-binding-pass", + "Remove duplicate 'tf_saved_model.bound_input' bindings."); + +std::unique_ptr> CreateDedupBoundInputBindingPass() { + return std::make_unique(); +} + +} // namespace tf_saved_model +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc b/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc index 1e622a295ec..69dab58c3f5 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc @@ -54,6 +54,8 @@ enum EinsumEquation { TransposeMatMul, BatchMatMulReducedDim, TransposeReducedDim, + FourDReduceLast, + FourDTransposeAll, UnsupportedEquation }; @@ -72,7 +74,7 @@ constexpr int kNumSupportedEquationVariables = 5; // A - E for now. 
bool tokenizeEquation(const llvm::StringRef& equation, std::vector* tokens) { std::map label_axis_mapping; - int index = 0; + size_t index = 0; int variable_count = 0; llvm::Regex r("[[:alpha:]]"); while (index < equation.size()) { @@ -146,6 +148,14 @@ EinsumEquation parseEquation(const std::vector& eqn) { if (is_equal(eqn, {A, B, C, COMMA, A, B, D, C, ARROW, A, B, D})) { return EinsumEquation::TransposeReducedDim; } + // ABCD,ADBE->ACBE + if (is_equal(eqn, {A, B, C, D, COMMA, A, D, B, E, ARROW, A, C, B, E})) { + return EinsumEquation::FourDReduceLast; + } + // ABCD,AECD->ACEB + if (is_equal(eqn, {A, B, C, D, COMMA, A, E, C, D, ARROW, A, C, E, B})) { + return EinsumEquation::FourDTransposeAll; + } return EinsumEquation::UnsupportedEquation; } @@ -167,7 +177,7 @@ TF::TransposeOp createTransposeOp(Value value, Location loc, auto perm_attr = DenseElementsAttr::get(perm_type, permutation); auto perm_op = rewriter->create(loc, perm_type, perm_attr); std::vector transposed_shape(shape.begin(), shape.end()); - for (int i = 0; i < shape.size(); ++i) { + for (int i = 0, end = shape.size(); i < end; ++i) { transposed_shape[i] = shape[permutation[i]]; } auto transposed_type = @@ -187,7 +197,7 @@ TF::SumOp createSumOp(Value value, Location loc, auto redux_op = rewriter->create(loc, redux_type, redux_attr); std::vector sum_shape(shape.size() - redux_axes.size()); int count = 0; - for (int i = 0; i < shape.size(); ++i) { + for (int i = 0, end = shape.size(); i < end; ++i) { if (std::find(redux_axes.begin(), redux_axes.end(), i) == redux_axes.end()) { sum_shape[count] = shape[i]; @@ -380,6 +390,7 @@ LogicalResult ConvertTFEinsumOp::matchAndRewrite( auto final_reshape = createReshapeOp(bmm_op, {lhs_dim0, lhs_dim1, rhs_dim3}, bmm_element_type, loc, &rewriter); rewriter.replaceOp(op, {final_reshape.getResult()}); + return success(); } if (einsum_eqn == EinsumEquation::TransposeReducedDim) { // Case "BIJ,BINJ->BIN" @@ -404,6 +415,45 @@ LogicalResult ConvertTFEinsumOp::matchAndRewrite( auto final_reshape = createReshapeOp(bmm_op, {lhs_dim0, lhs_dim1, rhs_dim2}, bmm_element_type, loc, &rewriter); rewriter.replaceOp(op, {final_reshape.getResult()}); + return success(); + } + if (einsum_eqn == EinsumEquation::FourDReduceLast) { + // Case "acbe,aecd->abcd" + const int lhs_dim2 = lhs_shape[2]; + const int rhs_dim0 = rhs_shape[0]; + const int rhs_dim2 = rhs_shape[2]; + const int rhs_dim3 = rhs_shape[3]; + // Transpose RHS + rhs = createTransposeOp(rhs, loc, {0, 2, 1, 3}, &rewriter); + std::vector bmm_shape = {rhs_dim0, rhs_dim2, lhs_dim2, rhs_dim3}; + auto bmm_type = RankedTensorType::get(bmm_shape, rhs_type.getElementType()); + auto bmm_op = rewriter.create( + loc, ArrayRef{bmm_type}, lhs, rhs, rewriter.getBoolAttr(false), + rewriter.getBoolAttr(false)); + + auto trans_bmm = createTransposeOp(bmm_op, loc, {0, 2, 1, 3}, &rewriter); + rewriter.replaceOp(op, {trans_bmm.getResult()}); + return success(); + } + if (einsum_eqn == EinsumEquation::FourDTransposeAll) { + // Case "aecd,abcd->acbe" + const int lhs_dim0 = lhs_shape[0]; + const int lhs_dim1 = lhs_shape[1]; + const int lhs_dim2 = lhs_shape[2]; + const int rhs_dim1 = rhs_shape[1]; + // Transpose LHS + lhs = createTransposeOp(lhs, loc, {0, 2, 1, 3}, &rewriter); + // Transpose RHS + rhs = createTransposeOp(rhs, loc, {0, 2, 3, 1}, &rewriter); + std::vector bmm_shape = {lhs_dim0, lhs_dim2, lhs_dim1, rhs_dim1}; + auto bmm_type = RankedTensorType::get(bmm_shape, rhs_type.getElementType()); + auto bmm_op = rewriter.create( + loc, ArrayRef{bmm_type}, lhs, rhs, 
rewriter.getBoolAttr(false), + rewriter.getBoolAttr(false)); + + auto trans_bmm = createTransposeOp(bmm_op, loc, {0, 1, 3, 2}, &rewriter); + rewriter.replaceOp(op, {trans_bmm.getResult()}); + return success(); } return failure(); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/executor_island_coarsening.cc b/tensorflow/compiler/mlir/tensorflow/transforms/executor_island_coarsening.cc index 0d72a7638a3..02a2e7efa6f 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/executor_island_coarsening.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/executor_island_coarsening.cc @@ -185,8 +185,8 @@ IslandOp CreateNewIsland(IslandOp parent, IslandOp child, Operation* old_island = insert_position == kParentIsland ? parent : child; OpBuilder builder(old_island); - auto new_island = builder.create( - old_island->getLoc(), result_types, operands, ArrayRef{}); + auto new_island = + builder.create(old_island->getLoc(), result_types, operands); new_island.body().push_back(new Block); return new_island; } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_inline_tpu_island.cc b/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_inline_tpu_island.cc index 9a533798208..f624d6cad58 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_inline_tpu_island.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_inline_tpu_island.cc @@ -61,11 +61,11 @@ void TPUBridgeExecutorIslandInlining::runOnOperation() { LLVM_DEBUG(llvm::dbgs() << "Found call to inline: " << *call_op.getOperation() << "\n"); - FuncOp called_func = dyn_cast_or_null( - symbol_table.lookupSymbolIn(getOperation(), call_op.f())); + auto call_interface = cast(call_op.getOperation()); + auto called_func = + dyn_cast_or_null(call_interface.resolveCallable()); - if (failed(inlineCall(inliner, - cast(call_op.getOperation()), + if (failed(inlineCall(inliner, call_interface, cast(called_func.getOperation()), called_func.getCallableRegion(), /* shouldCloneInlinedRegion = */ false))) { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_outline_tpu_island.cc b/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_outline_tpu_island.cc index e04f6bf3daa..a5177fac647 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_outline_tpu_island.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_outline_tpu_island.cc @@ -105,9 +105,10 @@ void TPUBridgeExecutorIslandOutlining::runOnOperation() { // Create the outlined function SmallString<32> name = kOutlinedFuncPrefix; name += llvm::Twine(prefix_id++).str(); - auto outlined_func = OpBuilder(ctx).create( - island_op.getLoc(), name, func_type, ArrayRef()); + auto outlined_func = + OpBuilder(ctx).create(island_op.getLoc(), name, func_type); outlined_symbol_table.insert(outlined_func); + outlined_func.setVisibility(FuncOp::Visibility::Nested); // We will "steal" the body of the island and replace it with a call to the // new function later. 
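A recurring change in the hunks above and below is how a call target is found: instead of looking the callee up by name in the module symbol table (the removed module.lookupSymbol calls on call_op.f(), then_branch(), cond(), and so on), the call op is cast to CallOpInterface and resolveCallable() is asked for the target. The sketch below only illustrates that pattern and is not code from this patch; the helper name GetCalleeBody and the exact header paths are assumptions.

// Hedged sketch of the CallOpInterface-based callee resolution adopted in this change.
#include "mlir/IR/Function.h"                // from @llvm-project
#include "mlir/Interfaces/CallInterfaces.h"  // from @llvm-project (path assumed)
#include "mlir/Support/LLVM.h"               // from @llvm-project

namespace mlir {

// Returns the body region of the function called by `op`, or nullptr when
// `op` is not a call or its callee cannot be resolved (e.g. an indirect call
// through a function-typed value).
static Region *GetCalleeBody(Operation *op) {
  auto call = dyn_cast<CallOpInterface>(op);
  if (!call) return nullptr;
  // resolveCallable() follows the symbol reference itself, so the caller no
  // longer needs a ModuleOp and a per-op symbol lookup.
  auto callee = dyn_cast_or_null<FuncOp>(call.resolveCallable());
  if (!callee) return nullptr;
  return callee.getCallableRegion();
}

}  // namespace mlir

Routing everything through the interface keeps the passes agnostic to the concrete call op, which is why the PartitionedCall/StatefulPartitionedCall special cases could be dropped from collection_ops_util.cc and why the island inlining pass can feed the resolved function's getCallableRegion() straight into inlineCall.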
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_cfg.cc b/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_cfg.cc index a0be88cc564..d8678e620f4 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_cfg.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_cfg.cc @@ -140,10 +140,6 @@ static LogicalResult LowerIfOp(IfOp op) { Value cond_i1 = LowerCondition(loc, op.cond(), &builder); if (!cond_i1) return failure(); - auto module = op_inst->getParentOfType(); - auto then_fn = module.lookupSymbol(op.then_branch()); - auto else_fn = module.lookupSymbol(op.else_branch()); - // Split the basic block before the 'if'. The new dest will be our merge // point. Block* orig_block = op_inst->getBlock(); @@ -161,14 +157,14 @@ static LogicalResult LowerIfOp(IfOp op) { // Set up the 'then' block. Block* then_block = builder.createBlock(merge_block); - Operation* call_op = CallFn(loc, get_operand, then_fn, &builder); + Operation* call_op = CallFn(loc, get_operand, op.then_func(), &builder); auto get_then_result = [&](int i) { return call_op->getResult(i); }; JumpToBlock(loc, get_then_result, merge_block, &builder); // Set up the 'else' block. Block* else_block = builder.createBlock(merge_block); - call_op = CallFn(loc, get_operand, else_fn, &builder); + call_op = CallFn(loc, get_operand, op.else_func(), &builder); auto get_else_result = [&](int i) { return call_op->getResult(i); }; JumpToBlock(loc, get_else_result, merge_block, &builder); @@ -194,9 +190,8 @@ static LogicalResult LowerWhileOp(WhileOp op) { OpBuilder builder(op_inst); - auto module = op_inst->getParentOfType(); - auto cond_fn = module.lookupSymbol(op.cond()); - auto body_fn = module.lookupSymbol(op.body()); + auto cond_fn = op.cond_func(); + auto body_fn = op.body_func(); // Split the block containing the While op into two blocks. One containing // operations before the While op and other containing the rest. Create two diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_regions.cc b/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_regions.cc index 5ab0eda08c6..d23b977f0e3 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_regions.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_regions.cc @@ -15,7 +15,7 @@ limitations under the License. // This transformation pass transforms functional control flow operations in the // TensorFlow dialect to their region based counterparts, i.e., -// tf.If -> tf.IfRegion +// tf.If -> tf.IfRegion and tf.While -> tf.WhileRegion #include "llvm/Support/raw_ostream.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project @@ -31,8 +31,11 @@ limitations under the License. #include "mlir/Pass/PassRegistry.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/attribute_utils.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#define DEBUG_TYPE "tf-functional-cf-to-region" + namespace mlir { namespace TF { @@ -44,24 +47,36 @@ struct FunctionalControlFlowToRegions void runOnOperation() override; }; -// Create a call to function `fn` with arguments `args` and return the CallOp. -// The arguments are cast to the required type before the call. 
-CallOp CreateCall(Location loc, Operation::operand_range args, FuncOp fn, - OpBuilder* builder) { - FunctionType fn_type = fn.getType(); - llvm::SmallVector operands; - int num_operands = fn_type.getNumInputs(); - operands.reserve(num_operands); - for (const auto& ArgAndType : zip(args, fn_type.getInputs())) { +// Creates a call to function `func` in region `caller_region`. Use `args` as +// the call arguments, and terminate the region with a yield. The arguments are +// cast to the required type before the call. `use_region_args` control whether +// the input arguments are used as is (for IfOp) or block arguments of the same +// type as the input arguments are created and then used as call arguments (for +// While). +void CreateCall(Operation* op, FuncOp func, Region& caller_region, + ValueRange args, bool use_region_args) { + assert(caller_region.empty() && + "Expected empty region for newly created ops"); + OpBuilder builder(caller_region); + Block* entry = builder.createBlock(&caller_region); + + if (use_region_args) { + entry->addArguments(args.getType()); + args = entry->getArguments(); + } + llvm::SmallVector casted_args; + casted_args.reserve(func.getNumArguments()); + for (const auto& ArgAndType : zip(args, func.getType().getInputs())) { Value arg = std::get<0>(ArgAndType); Type expected_type = std::get<1>(ArgAndType); if (arg.getType() != expected_type) { - arg = builder->create(loc, expected_type, arg, - /*Truncate=*/builder->getBoolAttr(false)); + arg = builder.create(op->getLoc(), expected_type, arg, + /*Truncate=*/builder.getBoolAttr(false)); } - operands.push_back(arg); + casted_args.push_back(arg); } - return builder->create(loc, fn, operands); + auto call = builder.create(op->getLoc(), func, casted_args); + builder.create(op->getLoc(), call.getResults()); } // Transform a functional IfOp to a region based IfRegionOp. @@ -69,32 +84,47 @@ LogicalResult ConvertIfOp(IfOp if_op) { auto if_region = OpBuilder(if_op).create( if_op.getLoc(), if_op.getResultTypes(), if_op.cond(), if_op.is_stateless()); + CopyUnderscoredAttributes(if_op, if_region); - // Insert call to the given function into the 'region'. 
- auto create_region_with_call = [&if_op](FlatSymbolRefAttr symbol, - Region& region) { - OpBuilder builder(region); - builder.createBlock(®ion); - auto func = if_op.getParentOfType().lookupSymbol( - symbol.getValue()); - auto call = CreateCall(if_op.getLoc(), if_op.input(), func, &builder); - builder.create(if_op.getLoc(), call.getResults()); - }; - - create_region_with_call(if_op.then_branchAttr(), if_region.then_branch()); - create_region_with_call(if_op.else_branchAttr(), if_region.else_branch()); - + CreateCall(if_op, if_op.then_func(), + /*caller_region=*/if_region.then_branch(), if_op.input(), + /*use_region_args=*/false); + CreateCall(if_op, if_op.else_func(), + /*caller_region=*/if_region.else_branch(), if_op.input(), + /*use_region_args=*/false); if_op.replaceAllUsesWith(if_region.getResults()); if_op.erase(); return success(); } +LogicalResult ConvertWhileOp(WhileOp while_op) { + auto while_region = OpBuilder(while_op).create( + while_op.getLoc(), while_op.getResultTypes(), while_op.input(), + while_op.is_stateless(), while_op.parallel_iterations()); + CopyUnderscoredAttributes(while_op, while_region); + + CreateCall(while_op, while_op.cond_func(), + /*caller_region=*/while_region.cond(), while_op.input(), + /*use_region_args=*/true); + CreateCall(while_op, while_op.body_func(), + /*caller_region=*/while_region.body(), while_op.input(), + /*use_region_args=*/true); + while_op.replaceAllUsesWith(while_region.getResults()); + while_op.erase(); + return success(); +} + void FunctionalControlFlowToRegions::runOnOperation() { ModuleOp module = getOperation(); auto result = module.walk([](Operation* op) { if (IfOp if_op = llvm::dyn_cast(op)) { if (failed(ConvertIfOp(if_op))) { - if_op.emitOpError() << " failed to convert to region form"; + op->emitOpError() << "failed to convert to region form"; + return WalkResult::interrupt(); + } + } else if (auto while_op = llvm::dyn_cast(op)) { + if (failed(ConvertWhileOp(while_op))) { + op->emitOpError() << "failed to convert to region form"; return WalkResult::interrupt(); } } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.cc index e2090803c00..7563f606434 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.cc @@ -51,7 +51,7 @@ Status MlirGraphOptimizationPass::Run(const ConfigProto& config_proto, CreateLayoutOptimizationPipeline(pm, layout_optimization_options); // Prepare IR for exporting. - pm.addNestedPass(CreateBreakUpIslandsPass()); + pm.addPass(CreateBreakUpIslandsPass()); // In case of failure, the `diag_handler` converts MLIR errors emitted to the // MLIRContext into a tensorflow::Status. diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/graph_pruning.cc b/tensorflow/compiler/mlir/tensorflow/transforms/graph_pruning.cc index f4d3eda3e7e..859d3ffb23c 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/graph_pruning.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/graph_pruning.cc @@ -19,15 +19,62 @@ limitations under the License. 
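A minimal standalone sketch (plain C++, not MLIR) of the walk-and-interrupt style used by runOnOperation above: a visitor is applied to every node and the first failure stops the traversal, mirroring WalkResult::interrupt() followed by signalPassFailure(). The Node type and op names are illustrative.

#include <functional>
#include <iostream>
#include <string>
#include <vector>

struct Node {
  std::string name;            // e.g. "tf.If", "tf.While", "tf.Add"
  std::vector<Node> children;  // nested ops, loosely modeling regions
};

// Pre-order walk that returns false as soon as the visitor fails on any node.
bool Walk(const Node& node, const std::function<bool(const Node&)>& visit) {
  if (!visit(node)) return false;  // "interrupt"
  for (const Node& child : node.children)
    if (!Walk(child, visit)) return false;
  return true;  // "advance"
}

int main() {
  Node module{"module", {{"tf.If", {}}, {"tf.While", {}}, {"tf.Add", {}}}};
  bool ok = Walk(module, [](const Node& n) {
    if (n.name == "tf.If" || n.name == "tf.While")
      std::cout << "converting " << n.name << " to region form\n";
    return true;  // return false here to simulate a failed conversion
  });
  std::cout << (ok ? "walk completed" : "walk interrupted") << "\n";
}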
#include "llvm/ADT/iterator_range.h" #include "llvm/Support/Casting.h" #include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/UseDefLists.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Transforms/RegionUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" namespace mlir { namespace tf_executor { +// Visits an op's operand if it is an output of an Operation in the same +// tf_executor.graph. +void VisitOpOperand(GraphOp graph, Value operand, + llvm::SmallPtrSetImpl* reachable_ops, + llvm::SmallVectorImpl* ops_to_visit) { + Operation* def = operand.getDefiningOp(); + if (def && def->getParentOp() == graph && reachable_ops->insert(def).second) { + // Op has not been visited, add to queue to visit later. + ops_to_visit->push_back(def); + } +} + +// Visits all operands of an op where each operand is an output of an Operation +// in the same tf_executor.graph. +void VisitOpOperands(GraphOp graph, Operation* op, + llvm::SmallPtrSetImpl* reachable_ops, + llvm::SmallVectorImpl* ops_to_visit) { + for (Value operand : op->getOperands()) + VisitOpOperand(graph, operand, reachable_ops, ops_to_visit); +} + +// Visits an op and it's associated operands. IslandOps are handled differently +// where it's regions op operands are also visited as values may be implicitly +// captured within. NextIterationSourceOp will also visit it's associated +// NextIterationSinkOp. +void VisitOp(GraphOp graph, Operation* op, + llvm::SmallPtrSetImpl* reachable_ops, + llvm::SmallVectorImpl* ops_to_visit) { + if (auto island = llvm::dyn_cast(op)) { + mlir::visitUsedValuesDefinedAbove( + island.body(), island.body(), [&](OpOperand* operand) { + VisitOpOperand(graph, operand->get(), reachable_ops, ops_to_visit); + }); + } + + VisitOpOperands(graph, op, reachable_ops, ops_to_visit); + + // If op is a `tf_executor.NextIteration.Source`, visit its associated + // `tf_executor.NextIteration.Sink` op. + if (auto source_op = llvm::dyn_cast(op)) { + Operation* sink_op = source_op.GetSink().getOperation(); + if (reachable_ops->insert(sink_op).second) ops_to_visit->push_back(sink_op); + } +} + // Prunes unreachable operations of a tf_executor.graph operation. void PruneGraph(GraphOp graph) { // A graph has a single block which forms a DAG: operations that aren't @@ -36,49 +83,23 @@ void PruneGraph(GraphOp graph) { llvm::SmallPtrSet reachable_ops; llvm::SmallVector ops_to_visit; - // Visit an op's operands if it is output of an Operation in same graph. - auto visit_op = [&](Operation* op) { - for (Value operand : op->getOperands()) { - Operation* def = operand.getDefiningOp(); - if (def && def->getParentOp() == graph && - reachable_ops.insert(def).second) { - // Op has not been visited, add to queue to visit later. - ops_to_visit.push_back(def); - } - } - }; - - // Visit `fetch` operands. - visit_op(graph.GetFetch()); + // Visit fetches first to create a starting point for ops that are reachable. + reachable_ops.insert(graph.GetFetch()); + VisitOpOperands(graph, graph.GetFetch(), &reachable_ops, &ops_to_visit); + // Visit transitive ops until no there are no reachable ops left that have not + // been visited. while (!ops_to_visit.empty()) { Operation* op = ops_to_visit.pop_back_val(); - if (llvm::isa(op)) { - // Visit island and island inner ops operands. 
- op->walk([&](Operation* inner_op) { visit_op(inner_op); }); - continue; - } else { - // Op is not an island, only visit its operands. - visit_op(op); - } - - // If op is a `tf_executor.NextIteration.Source`, visit its associated - // `tf_executor.NextIteration.Sink` op. - if (auto source_op = llvm::dyn_cast(op)) { - Operation* sink_op = source_op.GetSink().getOperation(); - if (reachable_ops.insert(sink_op).second) { - ops_to_visit.push_back(sink_op); - } - } + VisitOp(graph, op, &reachable_ops, &ops_to_visit); } - // Erase unreachable ops in reverse order. - for (Operation& op : llvm::make_early_inc_range( - llvm::drop_begin(llvm::reverse(graph.GetBody()), 1))) { - if (reachable_ops.find(&op) == reachable_ops.end()) { - op.erase(); - } - } + // Erase unreachable ops in reverse order so references don't need to be + // dropped before removing an op. Going in reverse order will guarantee that + // when an op to be erased is reached, there are no users left. + for (Operation& op : + llvm::make_early_inc_range(llvm::reverse(graph.GetBody()))) + if (!reachable_ops.contains(&op)) op.erase(); } namespace { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/guarantee_all_funcs_one_use.cc b/tensorflow/compiler/mlir/tensorflow/transforms/guarantee_all_funcs_one_use.cc new file mode 100644 index 00000000000..776afd72ad5 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/guarantee_all_funcs_one_use.cc @@ -0,0 +1,121 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "llvm/ADT/STLExtras.h" +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/Utils.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" + +namespace mlir { +namespace TF { + +namespace { + +// Clones FuncOp's until they have a single use only (or no users). +// +// The tf-shape-inference pass doesn't support functions that have more than +// a single use. But some real code from frontends does end up creating code +// like that. For example, the same LSTM cell function or loop body function +// will be reused. +// +// This pass clones functions as needed to establish the invariant that all +// functions have a single use. This can in principle cause exponential code +// size bloat, and should in general be guided by a proper cost model. +// +// There are two factors which should be considered by a principled replacement +// to this pass: +// +// 1. TF currently relies on "sufficiently good shape inference" for +// correctness so for now the cost of doing this seems acceptable since +// pathological cases haven't hit us yet. +// +// 2. 
Cloning functions can help by allowing code to be specialized (much as +// inlining does). In fact, tf-shape-inference attempts to do specialization +// of callees which is difficult if callees have multiple uses. +class GuaranteeAllFuncsOneUse + : public PassWrapper> { + public: + void runOnOperation() override { + if (failed(Run())) { + signalPassFailure(); + } + } + + LogicalResult Run() { + auto module = getOperation(); + + // Overall strategy: + // Fixed point iteration, iteratively applying a rule that clones + // any FuncOp with more than one use to eliminate its uses. + + SymbolTable symbol_table(module); + bool made_changes = false; + // This value needs to be low enough to actually stop compilation in a + // reasonable time, but not too low that it blocks real programs. + // This number was chosen semi-randomly. + const int k_max_clones = 1000; + int num_clones = 0; + do { + made_changes = false; + for (auto func : llvm::make_early_inc_range(module.getOps())) { + auto uses_optional = symbol_table.getSymbolUses(func, module); + if (!uses_optional.hasValue()) { + return func.emitError() << "could not walk uses of func"; + } + auto &uses = *uses_optional; + if (llvm::size(uses) <= 1) { + continue; + } + // At this point, we know we are going to change the module. + made_changes = true; + for (const SymbolTable::SymbolUse &use : llvm::drop_begin(uses, 1)) { + if (num_clones++ > k_max_clones) { + return func.emitError() + << "reached cloning limit (likely recursive call graph or " + "repeated diamond-like call structure " + "or just very large program)"; + } + auto new_func = func.clone(); + symbol_table.insert(new_func); + new_func.setVisibility(SymbolTable::Visibility::Private); + if (failed(symbol_table.replaceAllSymbolUses(func, new_func.getName(), + use.getUser()))) { + return func.emitError() << "could not replace symbol use"; + } + } + } + } while (made_changes); + + return success(); + } +}; + +} // namespace + +std::unique_ptr> CreateGuaranteeAllFuncsOneUsePass() { + return std::make_unique(); +} + +static PassRegistration pass( + "tf-guarantee-all-funcs-one-use", + "Guarantee all FuncOp's have only a single use."); + +} // namespace TF + +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/init_text_file_to_import.cc b/tensorflow/compiler/mlir/tensorflow/transforms/init_text_file_to_import.cc new file mode 100644 index 00000000000..615ca26012e --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/init_text_file_to_import.cc @@ -0,0 +1,134 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include + +#include "llvm/Support/Casting.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/FileUtilities.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir { +namespace TF { +namespace { + +static constexpr int kTextFileIndex_WholeLine = -2; +static constexpr int kTextFileIndex_LineNumber = -1; + +// InitTextFileToImportPass converts InitializeTableFromTextFileV2Op to the +// corresponding LookupTableImportV2Op if possible. +class InitTextFileToImportPass + : public mlir::PassWrapper { + public: + explicit InitTextFileToImportPass() {} + + private: + void runOnFunction() override; +}; + +class ConvertInitializeTableFromTextFileV2 + : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(InitializeTableFromTextFileV2Op op, + PatternRewriter& rewriter) const override { + // Now, this pattern matching only supports the following case, which is + // commonly used among inference use cases: + // + // tf.lookup.TextFileInitializer( + // "test.txt", tf.string, tf.lookup.TextFileIndex.WHOLE_LINE, + // tf.int64, tf.lookup.TextFileIndex.LINE_NUMBER, delimiter=" ") + // + // In the above case, the delimiter will be not used since the key is just a + // whole line and value is a line number. + if (op.key_index() != kTextFileIndex_WholeLine || + op.value_index() != kTextFileIndex_LineNumber || + op.vocab_size() != -1) { + return failure(); + } + + // Try to find filename from constant op. + DenseStringElementsAttr filename_attr; + if (!matchPattern(op.filename().getDefiningOp(), + m_Constant(&filename_attr))) { + return failure(); + } + StringRef filename = filename_attr.getRawStringData()[0]; + + // Read the content of the file. + std::string error_message; + auto file = openInputFile(filename, &error_message); + if (!file) { + return op.emitOpError("failed to open vocabulary file") + << " (" << filename.str() << "): " << error_message; + } + + // Splits into lines. + SmallVector lines; + file->getBuffer().split(lines, "\n", -1, false); + + // Map each line to line number, starting from zero. + SmallVector line_nums; + line_nums.resize(lines.size()); + std::iota(line_nums.begin(), line_nums.end(), 0); + + // Create constant ops for keys an values. + Value key_constant_tensor = rewriter.create( + op.getLoc(), + DenseStringElementsAttr::get( + RankedTensorType::get(static_cast(lines.size()), + StringType::get(rewriter.getContext())), + lines)); + + Value value_constant_tensor = rewriter.create( + op.getLoc(), rewriter.getI64TensorAttr(line_nums)); + + // Replace the given op with LookupTableImportV2Op. + rewriter.create(op.getLoc(), op.table_handle(), + key_constant_tensor, + value_constant_tensor); + rewriter.eraseOp(op); + return success(); + } +}; + +void InitTextFileToImportPass::runOnFunction() { + OwningRewritePatternList patterns; + MLIRContext* context = &getContext(); + FuncOp func = getFunction(); + + patterns.insert(context); + applyPatternsAndFoldGreedily(func, patterns); +} + +} // namespace + +// Replace InitializeTableFromTextFileV2Ops with LookupTableImportV2Ops. 
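A standalone sketch (plain C++, not MLIR) of what the pattern above materializes for a WHOLE_LINE / LINE_NUMBER initializer: each line of the vocabulary file becomes a string key and its zero-based line number becomes the int64 value, which is the pair of constant tensors fed to LookupTableImportV2. The file name here is illustrative.

#include <cstdint>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

int main() {
  std::ifstream file("vocab.txt");  // e.g. "apple\nbanana\ngrape"
  if (!file) {
    std::cerr << "failed to open vocabulary file\n";
    return 1;
  }
  std::vector<std::string> keys;
  std::vector<int64_t> values;
  std::string line;
  while (std::getline(file, line)) {
    values.push_back(static_cast<int64_t>(keys.size()));  // line number
    keys.push_back(line);                                 // whole line
  }
  for (size_t i = 0; i < keys.size(); ++i)
    std::cout << keys[i] << " -> " << values[i] << "\n";
}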
+std::unique_ptr> CreateInitTextFileToImportPass() { + return std::make_unique(); +} + +static PassRegistration pass( + "tf-init-text-file-to-import", + "convert InitializeTableFromTextFileV2 ops to LookupTableImportV2Op to " + "remove the dependency on asset files"); + +} // namespace TF +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/init_text_file_to_import_test_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/init_text_file_to_import_test_pass.cc new file mode 100644 index 00000000000..96a04fa6eeb --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/init_text_file_to_import_test_pass.cc @@ -0,0 +1,99 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "llvm/Support/Casting.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/ToolOutputFile.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/FileUtilities.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" + +namespace mlir { +namespace TF { +namespace { + +// InitTextFileToImportTestPass generates a temporary file and run the +// InitTextFileToImportPass for testing purpose. +class InitTextFileToImportTestPass + : public mlir::PassWrapper> { + public: + explicit InitTextFileToImportTestPass() {} + + private: + void runOnOperation() override; +}; + +void InitTextFileToImportTestPass::runOnOperation() { + ModuleOp module = getOperation(); + + // Create a temporary vocab file. + int fd; + SmallString<256> filename; + std::error_code error_code = + llvm::sys::fs::createTemporaryFile("text", "vocab", fd, filename); + if (error_code) return signalPassFailure(); + + llvm::ToolOutputFile temp_file(filename, fd); + const char* dictionary_in_lines = + "apple\n" + "banana\n" + "grape"; + temp_file.os() << dictionary_in_lines; + temp_file.os().flush(); + + // Replace filename constant ops to use the temporary file. + MLIRContext* context = &getContext(); + + for (FuncOp func : module.getOps()) { + llvm::SmallVector constant_ops(func.getOps()); + for (auto op : constant_ops) { + ShapedType shaped_type = + RankedTensorType::get({1}, StringType::get(context)); + + DenseStringElementsAttr attr; + if (!matchPattern(op.getOperation(), m_Constant(&attr))) { + continue; + } + + ArrayRef values = attr.getRawStringData(); + if (values.size() != 1 || values[0] != "%FILE_PLACEHOLDER") { + continue; + } + + op.valueAttr(DenseStringElementsAttr::get(shaped_type, {filename})); + } + } + + // Run the lowering pass. 
+ PassManager pm(context); + pm.addPass(CreateInitTextFileToImportPass()); + if (failed(pm.run(module))) return signalPassFailure(); +} + +} // namespace + +static PassRegistration pass( + "tf-init-text-file-to-import-test", + "generate a temporary file and invoke InitTextFileToImportPass"); + +} // namespace TF +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/launch_to_device_attribute.cc b/tensorflow/compiler/mlir/tensorflow/transforms/launch_to_device_attribute.cc index bce18c0b4b7..9f67a3e7e71 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/launch_to_device_attribute.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/launch_to_device_attribute.cc @@ -106,8 +106,8 @@ LogicalResult HoistOpsAndAnnotateWithDevice(const Dialect* tf_dialect, void LaunchToDeviceAttributePass::runOnFunction() { const Dialect* tf_dialect = getContext().getRegisteredDialect("tf"); if (!tf_dialect) { - signalPassFailure(); getFunction().emitError() << "'tf' dialect is not registered"; + return signalPassFailure(); } auto result = getFunction().walk([&](tf_device::LaunchOp launch) { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc b/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc index c263dcc75d1..ad241ef9488 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc @@ -41,6 +41,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.h" #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/core/framework/kernel_shape_util.h" namespace mlir { @@ -744,9 +745,7 @@ void LegalizeHloToTf::runOnFunction() { // Add legalization patterns to the list. OwningRewritePatternList patterns; - populateWithGenerated(&context, &patterns); - patterns.insert(&context); + PopulateLegalizeHloToTfPatterns(&patterns, &context); ConversionTarget target(context); target.addLegalDialect(); @@ -762,6 +761,13 @@ static PassRegistration pass( } // end namespace +void PopulateLegalizeHloToTfPatterns(OwningRewritePatternList *patterns, + MLIRContext *context) { + populateWithGenerated(context, patterns); + patterns->insert(context); +} + std::unique_ptr> CreateLegalizeHloToTfPass() { return std::make_unique(); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc index c0de6f557ab..483c84b3e80 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc @@ -113,12 +113,42 @@ Type InferExpandDimsType(Type ty, int64_t axis, Builder *builder) { // Lowers AddN op to a sequence of AddV2 ops to accumulate operands. // +// Note that to improve the parallelism, AddN op uses tree-based reduction. 
+// For example, tf.AddN([0, 1, 2, 3, 4]) behaves as follows: +// +// 0 1 2 3 4 +// | | | | | +// ------- ------- | +// | | | +// 5 6 | +// | | | +// ------------- | +// | | +// 7 | +// | | +// ---------------- +// | +// 8 +// +// Example: +// // %result = "tf.AddN"(%0, %1, %2) // // is lowered to: // -// %sum_0 = "tf.AddV2"(%0, %1) -// %result = "tf.AddV2"(%sum_0, %2) +// %sum0 = "tf.AddV2"(%0, %1) +// %result = "tf.AddV2"(%sum0, %2) +// +// While +// +// %result = "tf.AddN"(%0, %1, %2, %3, %4) +// +// is lowered to: +// +// %sum0 = "tf.AddV2"(%0, %1) +// %sum1 = "tf.AddV2"(%2, %3) +// %sum2 = "tf.AddV2"(%sum0, %sum1) +// %result = "tf.AddV2"(%sum2, %4) // class LowerAddNOp : public OpRewritePattern { public: @@ -131,14 +161,23 @@ class LowerAddNOp : public OpRewritePattern { // support variant type so variant types require special handling. if (getElementTypeOrSelf(op.getType()).isa()) return failure(); - // TODO(hinsu): Improve parallelism by splitting operands in two halves and - // accumulating them first. - Value result = *op.inputs().begin(); - for (Value operand : llvm::drop_begin(op.inputs(), 1)) { - result = rewriter.create(op.getLoc(), result, operand); + llvm::SmallVector operands(op.inputs().begin(), + op.inputs().end()); + + int64_t n = operands.size(); + // Keep doing tree-based reduction when there are more than one operand. + while (n > 1) { + for (int64_t i = 0; i < n; i += 2) { + // Add two adjacent operands if applicable. + operands[i / 2] = (i + 1 < n) + ? rewriter.create( + op.getLoc(), operands[i], operands[i + 1]) + : operands[i]; + } + n = (n + 1) / 2; } - rewriter.replaceOp(op, result); + rewriter.replaceOp(op, operands[0]); return success(); } }; @@ -344,12 +383,56 @@ class LowerPackOp : public OpRewritePattern { } }; +// Lowers `TF::SparseMatMulOp` to `TF::MatMulOp`, ignoring the sparseness hints, +// since we currently don't have an implementation that can use this +// information. Adds appropriate casts where necessary to align element types +// of operands and result for `TF::MatMulOp`. +class LowerSparseMatMulOp : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(TF::SparseMatMulOp op, + PatternRewriter &rewriter) const override { + // Result type must be f32 for applying the pattern (currently this is + // required by the op anyway but this might change). + if (!op.product().getType().cast().getElementType().isF32()) { + return failure(); + } + MLIRContext *context = rewriter.getContext(); + llvm::SmallVector operands{op.a(), op.b()}; + for (Value &operand : operands) { + TensorType tensor_type = operand.getType().cast(); + Type element_type = tensor_type.getElementType(); + if (element_type.isF32()) continue; + // Element type can either be f32 or bf16 for `SparseMatMulOp` so it + // must be bf16 here. + assert(element_type.isBF16()); + Type tensor_type_f32; + if (tensor_type.hasRank()) { + tensor_type_f32 = RankedTensorType::get(tensor_type.getShape(), + FloatType::getF32(context)); + } else { + tensor_type_f32 = UnrankedTensorType::get(FloatType::getF32(context)); + } + // Add cast to f32 to conform with element type of result. 
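A standalone sketch (plain C++, not MLIR) of the pairwise index arithmetic the AddN lowering above uses, applied to plain doubles: each round halves the number of partial sums, so the dependence depth is O(log n) instead of O(n). The use of doubles is illustrative only.

#include <iostream>
#include <vector>

double TreeSum(std::vector<double> operands) {
  size_t n = operands.size();
  while (n > 1) {
    for (size_t i = 0; i < n; i += 2)
      operands[i / 2] = (i + 1 < n)
                            ? operands[i] + operands[i + 1]
                            : operands[i];  // odd element passes through
    n = (n + 1) / 2;
  }
  return operands[0];
}

int main() {
  std::cout << TreeSum({0, 1, 2, 3, 4}) << "\n";  // 10, matching the diagram above
}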
+ operand = + rewriter.create(op.getLoc(), tensor_type_f32, operand); + } + Value result = rewriter.create( + op.getLoc(), op.product().getType(), operands[0], operands[1], + op.transpose_a(), op.transpose_b()); + + rewriter.replaceOp(op, {result}); + return success(); + } +}; + } // namespace void PopulateLoweringTFPatterns(MLIRContext *context, OwningRewritePatternList *patterns) { patterns->insert(context); + LowerPackOp, LowerSparseMatMulOp>(context); populateWithGenerated(context, patterns); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/mark_ops_for_outside_compilation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/mark_ops_for_outside_compilation.cc new file mode 100644 index 00000000000..ece26dca416 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/mark_ops_for_outside_compilation.cc @@ -0,0 +1,177 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include +#include + +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Transforms/RegionUtils.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/xla/transforms/passes.h" + +namespace mlir { +namespace TFDevice { + +namespace { + +constexpr char kXlaOutsideCompilationAttr[] = "_xla_outside_compilation"; + +// This pass marks unsupported ops in a device cluster with +// `_xla_outside_compilation` attribute so the operations will run on the host +// instead of the device. Unsupported ops are ops that can not be code +// generated to run on the device for the cluster. +struct MarkOpsForOutsideCompilation + : public PassWrapper> { + void runOnOperation() override; +}; + +// TODO(b/159128666): Check the control flow legalization passes instead once +// added. +void AddSupportedControlFlowOps(MLIRContext* context, + llvm::DenseSet* supported_ops) { + supported_ops->insert( + OperationName(TF::IfRegionOp::getOperationName(), context)); + supported_ops->insert( + OperationName(TF::WhileRegionOp::getOperationName(), context)); + supported_ops->insert( + OperationName(TF::YieldOp::getOperationName(), context)); +} + +// These embedding ops are rewritten when running TPUCompileOp. 
+void AddRewrittenEmbeddingOps(MLIRContext* context, + llvm::DenseSet* supported_ops) { + supported_ops->insert(OperationName( + TF::RecvTPUEmbeddingActivationsOp::getOperationName(), context)); + supported_ops->insert(OperationName( + TF::SendTPUEmbeddingGradientsOp::getOperationName(), context)); +} + +bool HasStringOperand(Operation& op) { + for (auto operand : op.getOperands()) { + if (getElementTypeOrSelf(operand).isa()) return true; + } + return false; +} + +bool HasStringResult(Operation& op) { + for (auto result : op.getResults()) { + if (getElementTypeOrSelf(result).isa()) return true; + } + return false; +} + +bool MatchesPattern(Operation& op, + const llvm::DenseSet& supported_ops) { + return (supported_ops.contains(op.getName())); +} + +// Checks if the op is supported inside of a device cluster. Ops not +// in `tf_dialect` are considered supported. +bool IsSupportedOp(Operation& op, + const llvm::DenseSet& supported_ops, + const Dialect* tf_dialect) { + if (op.getDialect() != tf_dialect) + return true; + else + return !HasStringOperand(op) && !HasStringResult(op) && + (MatchesPattern(op, supported_ops) || + mhlo::IsOpAllowedTf2XlaFallback(&op)); +} + +// Checks all regions of `op` for captured string operands. +bool HasCapturedStringOperand(Operation* op) { + bool string_operand = false; + for (auto& region : op->getRegions()) { + mlir::visitUsedValuesDefinedAbove( + region, region, [&](mlir::OpOperand* operand) { + if (getElementTypeOrSelf(operand->get()).isa()) + string_operand = true; + }); + if (string_operand) return string_operand; + } + return string_operand; +} + +// Marks uncompilable ops that are in `tf_dialect` for outside compilation. +LogicalResult MarkUncompilableOps( + const Dialect* tf_dialect, Block* block, + llvm::DenseSet& supported_ops) { + block->walk([&](Operation* op) { + if (!IsSupportedOp(*op, supported_ops, tf_dialect)) { + op->setAttr(kXlaOutsideCompilationAttr, + StringAttr::get("auto", op->getContext())); + } + if (llvm::isa(op)) { + if (HasCapturedStringOperand(op)) { + op->setAttr(kXlaOutsideCompilationAttr, + StringAttr::get("auto", op->getContext())); + } + } + }); + return success(); +} + +void MarkOpsForOutsideCompilation::runOnOperation() { + auto module = getOperation(); + const Dialect* tf_dialect = getContext().getRegisteredDialect("tf"); + if (!tf_dialect) { + getOperation().emitError() << "'tf' dialect is not registered"; + return signalPassFailure(); + } + OwningRewritePatternList patterns; + mhlo::PopulateLegalizeTfPatterns(module.getContext(), &patterns); + + // `supported_ops` contains the name of all of the ops that can potentially be + // lowered into HLO on the device. This doesn't always mean that the op can + // be lowered in the future passes but if the op is not in this set, it can't + // be lowered in a subsequent pass. 
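A standalone sketch (plain C++, not MLIR) of the decision rule MarkUncompilableOps applies: a TF-dialect op stays on the device only if it has no string operands or results and its name is in the set of legalizable ops; otherwise it is tagged _xla_outside_compilation="auto". The op model and set contents are illustrative, and the tf2xla fallback check is omitted.

#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

struct FakeOp {
  std::string dialect;  // "tf" or something else
  std::string name;     // e.g. "tf.MatMul"
  bool has_string_operand_or_result = false;
  std::map<std::string, std::string> attrs;
};

void MarkIfUnsupported(FakeOp& op, const std::set<std::string>& supported) {
  if (op.dialect != "tf") return;  // non-TF ops are left alone
  bool ok = !op.has_string_operand_or_result && supported.count(op.name) > 0;
  if (!ok) op.attrs["_xla_outside_compilation"] = "auto";
}

int main() {
  std::set<std::string> supported = {"tf.MatMul", "tf.IfRegion", "tf.Yield"};
  std::vector<FakeOp> ops = {{"tf", "tf.MatMul"},
                             {"tf", "tf.AsString", true},
                             {"tf", "tf.SomeCustomOp"}};
  for (FakeOp& op : ops) {
    MarkIfUnsupported(op, supported);
    std::cout << op.name
              << (op.attrs.count("_xla_outside_compilation")
                      ? " -> outside compilation\n"
                      : " -> device\n");
  }
}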
+ llvm::DenseSet supported_ops; + for (auto& pattern : patterns) { + supported_ops.insert(*pattern->getRootKind()); + } + AddSupportedControlFlowOps(module.getContext(), &supported_ops); + AddRewrittenEmbeddingOps(module.getContext(), &supported_ops); + + auto result = module.walk([&](tf_device::ClusterOp cluster) { + if (failed( + MarkUncompilableOps(tf_dialect, &cluster.GetBody(), supported_ops))) + return WalkResult::interrupt(); + + return WalkResult::advance(); + }); + + if (result.wasInterrupted()) return signalPassFailure(); +} + +} // namespace + +std::unique_ptr> +CreateMarkOpsForOutsideCompilationPass() { + return std::make_unique(); +} + +static PassRegistration pass( + "tf-mark-ops-for-outside-compilation", + "Marks unsupported ops a device cluster for outside compilation."); + +} // namespace TFDevice +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc b/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc index 67a6c8dd6dd..6fee693554e 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc @@ -68,9 +68,8 @@ bool IsResource(Value value) { return IsResourceType(value.getType()); } class ResourceAnalyzer { public: explicit ResourceAnalyzer(ModuleOp module) { - SymbolTable symbol_table(module); for (auto func : module.getOps()) { - AnalyzeFunc(func, symbol_table); + AnalyzeFunc(func); } } @@ -89,7 +88,7 @@ class ResourceAnalyzer { // written". Do this recursively across the chain of funcs via call or control // flow ops. // TODO(ashwinm): Move to iterative traversal. - LogicalResult AnalyzeFunc(FuncOp func, const SymbolTable& symbol_table) { + LogicalResult AnalyzeFunc(FuncOp func) { // Avoid infinite recursion. if (!discovered_.insert(func).second) { return success(); @@ -104,24 +103,20 @@ class ResourceAnalyzer { return; } if (auto call = dyn_cast(op)) { - if (auto sym = op->getAttrOfType("f")) { - PropagatePotentiallyWrittenUpFromCallee( - sym.cast().getValue(), call.getArgOperands(), - symbol_table); + if (auto func = dyn_cast(call.resolveCallable())) { + PropagatePotentiallyWrittenUpFromCallee(func, call.getArgOperands()); } return; } if (auto if_op = dyn_cast(op)) { - for (auto callee : {if_op.then_branch(), if_op.else_branch()}) { - PropagatePotentiallyWrittenUpFromCallee(callee, if_op.input(), - symbol_table); + for (auto callee : {if_op.then_func(), if_op.else_func()}) { + PropagatePotentiallyWrittenUpFromCallee(callee, if_op.input()); } return; } if (auto while_op = dyn_cast(op)) { - for (auto callee : {while_op.cond(), while_op.body()}) { - PropagatePotentiallyWrittenUpFromCallee(callee, while_op.input(), - symbol_table); + for (auto callee : {while_op.cond_func(), while_op.body_func()}) { + PropagatePotentiallyWrittenUpFromCallee(callee, while_op.input()); } return; } @@ -149,15 +144,13 @@ class ResourceAnalyzer { }); } - // Given a funcOp associated with the callee and operands from the + // Given a FuncOp associated with the callee and operands from the // corresponding callOp, propagate the potentially written decision to the // callOp's operands, if the corresponding func's arguments are potentially // written resources. 
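A standalone sketch (plain C++, not MLIR) of the memoized propagation ResourceAnalyzer performs: each function records which of its arguments it may write, analyzing a caller first analyzes the callee (once, guarded by a "discovered" set), and written callee arguments mark the caller operands that feed them. The call graph and names here are illustrative.

#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

struct Func {
  std::set<int> directly_written_args;  // e.g. from AssignVariableOp uses
  std::vector<std::pair<std::string, std::vector<int>>> calls;  // callee, arg map
};

std::map<std::string, Func> funcs;
std::set<std::string> discovered;
std::map<std::string, std::set<int>> written_args;  // analysis result

void Analyze(const std::string& name) {
  if (!discovered.insert(name).second) return;  // already analyzed (or in progress)
  const Func& f = funcs[name];
  written_args[name] = f.directly_written_args;
  for (const auto& call : f.calls) {
    Analyze(call.first);
    // If callee argument i may be written, the caller value passed into it is
    // potentially written as well.
    for (int i = 0; i < static_cast<int>(call.second.size()); ++i)
      if (written_args[call.first].count(i))
        written_args[name].insert(call.second[i]);
  }
}

int main() {
  funcs["body"] = {{0}, {}};              // writes its first argument
  funcs["main"] = {{}, {{"body", {1}}}};  // passes its arg 1 into body's arg 0
  Analyze("main");
  for (int arg : written_args["main"])
    std::cout << "main arg " << arg << " may be written\n";
}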
void PropagatePotentiallyWrittenUpFromCallee( - StringRef callee, Operation::operand_range propagate_to, - const SymbolTable& symbol_table) { - auto func = symbol_table.lookup(callee); - AnalyzeFunc(func, symbol_table); + FuncOp func, Operation::operand_range propagate_to) { + AnalyzeFunc(func); for (auto t : llvm::zip(func.getArguments(), propagate_to)) { if (!IsResource(std::get<0>(t))) { continue; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/parallel_execute_to_islands.cc b/tensorflow/compiler/mlir/tensorflow/transforms/parallel_execute_to_islands.cc index c13d7de754e..1332c8b6e59 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/parallel_execute_to_islands.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/parallel_execute_to_islands.cc @@ -71,6 +71,7 @@ limitations under the License. #include "llvm/ADT/SmallVector.h" #include "mlir/IR/Block.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project @@ -111,8 +112,8 @@ LogicalResult ExpandParallelExecuteToIslands( // executed. llvm::SetVector region_inputs; getUsedValuesDefinedAbove(*execute_region, region_inputs); - llvm::SmallVector execution_control_inputs; - if (region_inputs.empty()) + llvm::SmallVector execution_control_inputs; + if (region_inputs.empty() && input_sink_island) execution_control_inputs.emplace_back(input_sink_island.control()); // Collect result types and operands. @@ -147,13 +148,22 @@ tf_executor::IslandOp CreateInputBarrierIsland( OpBuilder* builder, tf_executor::IslandOp island_op) { builder->setInsertionPoint(island_op); - llvm::SetVector island_inputs; - getUsedValuesDefinedAbove(island_op.body(), island_inputs); + llvm::SetVector all_inputs; + getUsedValuesDefinedAbove(island_op.body(), all_inputs); + // Filter out values that are arguments and doesn't need to be part of the + // entry barrier. + llvm::SmallVector island_inputs; llvm::SmallVector input_types; - input_types.reserve(island_inputs.size()); - for (const auto& input_val : island_inputs) - input_types.emplace_back(input_val.getType()); + island_inputs.reserve(all_inputs.size()); + input_types.reserve(all_inputs.size()); + for (Value val : all_inputs) { + if (!val.isa()) { + island_inputs.push_back(val); + input_types.push_back(val.getType()); + } + } + if (island_inputs.empty() && island_op.controlInputs().empty()) return {}; // Create new island for that forwards all inputs. auto control_type = tf_executor::ControlType::get(island_op.getContext()); @@ -190,7 +200,7 @@ tf_executor::IslandOp CreateOutputBarrierIsland( builder->setInsertionPoint(island_op); auto island_output_sink = builder->create( island_op.getLoc(), llvm::to_vector<8>(island_op.getResultTypes()), - island_operands, llvm::ArrayRef{}); + island_operands); island_output_sink.body().push_back(new Block); return island_output_sink; } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/parallelize_embedding_params_ops_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/parallelize_embedding_params_ops_pass.cc new file mode 100644 index 00000000000..527af0934ea --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/parallelize_embedding_params_ops_pass.cc @@ -0,0 +1,152 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This transformation parallelizes TPU embedding params assigned to different +// shards using the parallel execute op. This is useful to avoid introducing +// control dependency between these ops that are known to be independent. + +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Block.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.h" + +namespace mlir { +namespace TFDevice { + +namespace { + +struct ParallelizeEmbeddingParamsOpsPass + : public PassWrapper { + void runOnFunction() override; +}; + +bool IsLoadTPUEmbeddingParmasOp(Operation& op) { + static const auto* algorithms = []() { + auto* algorithms = new llvm::SmallSet(); + for (tensorflow::tpu::OptimizationAlgorithm alg : + tensorflow::tpu::GetOptimizationAlgorithms()) { + const auto alg_name = tensorflow::tpu::GetOptimizationAlgorithmName(alg); + algorithms->insert(alg_name); + } + return algorithms; + }(); + StringRef op_name = op.getName().getStringRef(); + return op_name.consume_front("tf.LoadTPUEmbedding") && + op_name.consume_back("Parameters") && + algorithms->contains(op_name.str()); +} + +static LogicalResult RunOnIsland(tf_executor::IslandOp island) { + Block* block = island.getBody(); + + // Map from op to the id of the shard it is assigned for ops that can execute + // in parallel across shards. + llvm::SmallMapVector assigned_shard; + llvm::SmallVector resources; + llvm::SmallSet shard_ids; + for (Operation& op : llvm::reverse(*block)) { + int64_t shard = -1; + if (IsLoadTPUEmbeddingParmasOp(op)) { + auto shard_id = op.getAttrOfType("shard_id"); + if (!shard_id) { + return op.emitOpError("requires 'shard_id' integer attribute"); + } + shard = shard_id.getInt(); + shard_ids.insert(shard); + } else if (auto read_op = llvm::dyn_cast(op)) { + if (assigned_shard.empty()) continue; + + for (Operation* user : op.getUsers()) { + auto iter = assigned_shard.find(user); + if (iter == assigned_shard.end() || + (shard != -1 && shard != iter->second)) { + shard = -1; + break; + } + shard = iter->second; + } + if (shard != -1) resources.push_back(read_op.resource()); + } + + if (shard != -1) assigned_shard.insert(std::make_pair(&op, shard)); + } + + // No transformations are required. 
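A standalone sketch (plain C++, not MLIR) of the user-agreement rule the pass above applies to ReadVariableOps: a read may move into a shard's parallel_execute region only if every user of its result is already assigned to that same shard; otherwise it stays unassigned (shard == -1). The op names are illustrative.

#include <iostream>
#include <map>
#include <string>
#include <vector>

int ShardForRead(const std::vector<std::string>& users,
                 const std::map<std::string, int>& assigned_shard) {
  int shard = -1;
  for (const std::string& user : users) {
    auto it = assigned_shard.find(user);
    if (it == assigned_shard.end() || (shard != -1 && shard != it->second))
      return -1;  // unknown user, or users disagree: leave the read unassigned
    shard = it->second;
  }
  return shard;
}

int main() {
  std::map<std::string, int> assigned_shard = {{"LoadAdamParameters_0", 0},
                                               {"LoadAdamParameters_1", 1}};
  std::cout << ShardForRead({"LoadAdamParameters_0"}, assigned_shard) << "\n";  // 0
  std::cout << ShardForRead({"LoadAdamParameters_0", "LoadAdamParameters_1"},
                            assigned_shard)
            << "\n";  // -1
}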
+ int num_shards = shard_ids.size(); + if (num_shards <= 1) return success(); + + // If the resources are used for ops other than read variable op, then moving + // read variable ops to the parallel_execute may not preserve the semantics. + for (Value resource : resources) { + for (Operation* user : resource.getUsers()) + if (!llvm::isa(*user)) return success(); + } + + // Create parallel_execute op at the end of the block and move operations + // to their corresponding shard. + auto builder = OpBuilder::atBlockTerminator(block); + auto parallel_execute_op = builder.create( + island.getLoc(), num_shards, llvm::ArrayRef()); + for (int shard_id = 0; shard_id < num_shards; ++shard_id) { + mlir::Block& b = parallel_execute_op.GetRegionBlockWithIndex(shard_id); + builder.setInsertionPointToStart(&b); + builder.create(island.getLoc()); + } + + for (auto op_shard : assigned_shard) { + int64_t shard = op_shard.second; + if (shard >= num_shards) { + return island.emitOpError( + "load tpu embedding ops require continuous range of shards"); + } + mlir::Block& b = parallel_execute_op.GetRegionBlockWithIndex(shard); + op_shard.first->moveBefore(&b, b.begin()); + } + return success(); +} + +void ParallelizeEmbeddingParamsOpsPass::runOnFunction() { + getFunction().walk([&](tf_executor::IslandOp island) { + if (failed(RunOnIsland(island))) { + signalPassFailure(); + return WalkResult::interrupt(); + } + return WalkResult::advance(); + }); +} + +} // namespace + +std::unique_ptr> +CreateParallelizeEmbeddingParamsOpsPass() { + return std::make_unique(); +} +} // namespace TFDevice +} // namespace mlir + +static mlir::PassRegistration + pass("tf-parallize-embedding-params-ops", + "Parallelizes TPU embedding params assigned to different shards using " + "the parallel_execte op"); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h index 5af8a0195a4..3be6c9e1a70 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h @@ -18,13 +18,15 @@ limitations under the License. #include +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project namespace mlir { // Creates a pass that breaks up an island with multiple ops into multiple // islands, each with a single op. -std::unique_ptr> CreateBreakUpIslandsPass(); +std::unique_ptr> CreateBreakUpIslandsPass(); // Creates a pass that converts mlir functions consisting of mlir ops into a // tf_executor dialect as a single island. @@ -58,6 +60,9 @@ std::unique_ptr> CreateMaterializePassthroughOpPass(); // Performs Shape Inference on the TensorFlow dialect using the global registry. std::unique_ptr> CreateTFShapeInferencePass(); +// Guarantee that all FuncOp's have a single use. +std::unique_ptr> CreateGuaranteeAllFuncsOneUsePass(); + // Optional pass which will unroll BatchMatMul and use only MatMul std::unique_ptr> CreateUnrollBatchMatMulPassPass(); @@ -148,6 +153,10 @@ CreateTensorArrayOpsDecompositionPass(); // Create a pass that legalize HLO to TF dialect. std::unique_ptr> CreateLegalizeHloToTfPass(); +// Addds the HLO to TF rewrite patterns to the specified pattern list. +void PopulateLegalizeHloToTfPatterns(OwningRewritePatternList* patterns, + MLIRContext* context); + // Matches sequence of ops to TensorFlow fused kernels. This pass should not be // generally used beyond exporting to runtimes that supports these ops. 
In the // future these fusions may be codegen'd automatically. @@ -155,6 +164,10 @@ std::unique_ptr> CreateFusedKernelMatcherPass(); // Creates function pass to select device index/fold tf.DeviceIndex. std::unique_ptr> CreateDeviceIndexSelectorPass(); + +// Creates function pass to replace InitializeTableFromTextFileV2Ops with +// LookupTableImportV2Op ops. +std::unique_ptr> CreateInitTextFileToImportPass(); } // namespace TF namespace tf_executor { @@ -226,17 +239,27 @@ std::unique_ptr> CreateReplicateInvariantOpHoistingPass(); // Creates a pass that forms replica `tf_executor.island` from a single // `tf_device.replicate` island. -std::unique_ptr> CreateReplicateToIslandPass(); +std::unique_ptr> CreateReplicateToIslandPass(); // Creates a pass that creates `tf_executor.island` from a single // `tf_device.parallel_execute` island. std::unique_ptr> CreateParallelExecuteToIslandsPass(); +// Create a pass to parallelize TPU embedding params assigned to different +// shards using the parallel_execte op. +std::unique_ptr> +CreateParallelizeEmbeddingParamsOpsPass(); + // Creates a pass that annotates whether a LaunchFuncOp's parameters have the // same data across replicas. std::unique_ptr> CreateAnnotateParameterReplicationPass(); +// Creates a pass that marks unsupported ops in device cluster for outside +// compilation. +std::unique_ptr> +CreateMarkOpsForOutsideCompilationPass(); + // Creates a pass that hoists a `tf_device.launch` body and assigns a `device` // attribute to each TensorFlow dialect op in the body based on the `device` // attribute on the `tf_device.launch`. @@ -250,7 +273,7 @@ std::unique_ptr> CreateTPUClusterFormationPass(); // Creates a pass that allows TPU program inputs to have layouts determined at // run time. -std::unique_ptr> CreateTPUDynamicLayoutPass(); +std::unique_ptr> CreateTPUDynamicLayoutPass(); // Creates a pass that remaps and assigns padding map from a // `tf_device.launch_func` `padding_map` attribute to its encapsulated function. diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc b/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc index 961287b0b1f..89910d6b3a5 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc @@ -304,7 +304,7 @@ LogicalResult PromoteResourcesToArguments( continue; } - const auto index = resource_and_index.index(); + const int64_t index = resource_and_index.index(); const bool is_var_handle = index >= var_handles_start_idx; if (resource.write) { if (!is_var_handle || resource.read) { @@ -342,7 +342,8 @@ LogicalResult PromoteResourcesToArguments( } // Rewrite return if there are variable writes. 
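A hedged sketch of how pass factories declared in passes.h are typically composed on a PassManager, assuming an MLIR/TensorFlow build of this era (includes and result-op types follow the declarations in this diff). The ordering and nesting below are illustrative only, not the pipeline TensorFlow registers.

#include "mlir/IR/Function.h"       // from @llvm-project
#include "mlir/Pass/PassManager.h"  // from @llvm-project
#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h"

void BuildIllustrativePipeline(mlir::PassManager& pm) {
  // Module-level passes run once over the whole module.
  pm.addPass(mlir::TF::CreateGuaranteeAllFuncsOneUsePass());
  pm.addPass(mlir::TFDevice::CreateMarkOpsForOutsideCompilationPass());
  // Function-level passes are nested so they run on every FuncOp.
  pm.addNestedPass<mlir::FuncOp>(mlir::TF::CreateInitTextFileToImportPass());
}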
- if (return_operands.size() > num_results_before) { + const int return_operands_size = return_operands.size(); + if (return_operands_size > num_results_before) { builder.create(return_op.getLoc(), return_operands); return_op.erase(); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/readonly_references_to_resources.cc b/tensorflow/compiler/mlir/tensorflow/transforms/readonly_references_to_resources.cc index 5fc35361bca..104f11e0cc0 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/readonly_references_to_resources.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/readonly_references_to_resources.cc @@ -65,8 +65,15 @@ class ConvertReadonlyReferenceVariablesToResourceVariablesPass StringRef GetNodeNameFromClassAttr(Operation *op) { ArrayAttr classes_attr = op->getAttrOfType(kClassAttr); if (!classes_attr) { - op->emitOpError() << "has no '_class' attribute"; - return StringRef(); + // Attampt to parse "_class" from the IdentityOp that follows VariableV2. + // For read-only reference variables, IdentityOp should be the only user of + // VariableV2. + auto identity_op = op->getUsers().begin(); + classes_attr = identity_op->getAttrOfType(kClassAttr); + if (!classes_attr) { + op->emitOpError() << "has no '_class' attribute"; + return StringRef(); + } } StringRef result; @@ -153,7 +160,7 @@ void ConvertReadonlyReferenceVariablesToResourceVariablesPass::runOnFunction() { builder.setInsertionPoint(user); ReadVariableOp read_variable_op = builder.create( user->getLoc(), ArrayRef{tensor_type}, - ArrayRef{var_handle_op}, ArrayRef{}); + ArrayRef{var_handle_op}); user->getResult(0).replaceAllUsesWith(read_variable_op.getResult()); user->erase(); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/region_control_flow_to_functional.cc b/tensorflow/compiler/mlir/tensorflow/transforms/region_control_flow_to_functional.cc index ca0467942ca..ba876e08fbb 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/region_control_flow_to_functional.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/region_control_flow_to_functional.cc @@ -15,9 +15,11 @@ limitations under the License. // This transformation pass transforms region bases control flow operations in // the TensorFlow dialect to their functional counterparts, i.e., -// tf.IfRegion -> tf.If +// tf.IfRegion -> tf.If and tf.WhileRegion -> tf.While #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Casting.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project @@ -34,8 +36,11 @@ limitations under the License. #include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/attribute_utils.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#define DEBUG_TYPE "tf-region-cf-to-functional" + namespace mlir { namespace TF { @@ -48,6 +53,7 @@ struct RegionControlFlowToFunctional private: LogicalResult ConvertIfOp(IfRegionOp if_region); + LogicalResult ConvertWhileOp(WhileRegionOp while_region); // Get unique name by using the loc to name mapping. 
std::string GetName(Operation* op, StringRef suffix); @@ -61,20 +67,20 @@ std::string RegionControlFlowToFunctional::GetName(Operation* op, return (mapper.GetUniqueName(op) + suffix).str(); } -// Returns all the external values referenced from the given set of regions. If -// the external value is a constant, sink it into the region instead (and do not +// Returns all the external values referenced from the given regions. If the +// external value is a constant, sink it into the region instead (and do not // add it to the returned vector). -llvm::SmallVector CollectExternValues(ArrayRef regions) { - llvm::SetVector extern_values_set; +llvm::SmallVector CollectExternValues(Region& first, Region& second) { + llvm::SetVector extern_values; - for (auto region : regions) { + for (Region* region : {&first, &second}) { llvm::SetVector region_extern_values; getUsedValuesDefinedAbove(*region, region_extern_values); // Sink down constants into the functions. for (auto extern_value : region_extern_values) { if (!matchPattern(extern_value, m_Constant())) { - extern_values_set.insert(extern_value); + extern_values.insert(extern_value); continue; } // Add constant at start of region. @@ -85,28 +91,43 @@ llvm::SmallVector CollectExternValues(ArrayRef regions) { } } - return {extern_values_set.begin(), extern_values_set.end()}; + return llvm::to_vector<4>(extern_values); } // Extracts the contents of a region with a single block into a new function. // `extern_values` is the set of external values that the region refers to. // -// Any inputs to the terminator of the region are converted to return values of -// the function. If any of these values is not exact type as the function's -// return type, appropriate cast operations will be inserted -void ExtractSingleBlockRegion(Region& region, FunctionType type, StringRef name, +// Inputs to the terminator of the region are converted to return values of +// the function. If `extern_values_passthrough` is true, all the extern values +// are also added as return values from the function +void ExtractSingleBlockRegion(Region& region, StringRef name, llvm::SmallVectorImpl& extern_values, - llvm::SmallVectorImpl& worklist) { + llvm::SmallVectorImpl& worklist, + bool extern_values_passthrough) { ModuleOp module = region.getParentOfType(); auto builder = OpBuilder::atBlockBegin(module.getBody()); auto loc = region.getParentOp()->getLoc(); + Block& entry = region.front(); + int num_region_arguments = entry.getNumArguments(); + Operation* terminator = entry.getTerminator(); + + // Build the function type. Region arguments and extern values together + // become the function arguments, with region arguments going first. + auto input_types = llvm::to_vector<4>(entry.getArgumentTypes()); + for (auto input : extern_values) input_types.push_back(input.getType()); + + // Terminator operands and pass through extern values (if enabled) together + // become the function return values. + auto return_types = llvm::to_vector<4>(terminator->getOperandTypes()); + if (extern_values_passthrough) + for (auto input : extern_values) return_types.push_back(input.getType()); + + auto type = FunctionType::get(input_types, return_types, region.getContext()); // Create new function and extract region body into the function. 
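A standalone sketch (plain C++, not MLIR) of how the outlined function's signature is assembled in ExtractSingleBlockRegion: region block arguments come first, then the captured ("extern") values; results are the terminator operands, optionally followed by the extern values again when extern_values_passthrough is set. Types are modeled as strings and the sample types are illustrative.

#include <iostream>
#include <string>
#include <vector>

struct Signature {
  std::vector<std::string> inputs;
  std::vector<std::string> results;
};

Signature BuildOutlinedSignature(const std::vector<std::string>& region_arg_types,
                                 const std::vector<std::string>& extern_types,
                                 const std::vector<std::string>& terminator_types,
                                 bool extern_values_passthrough) {
  Signature sig;
  sig.inputs = region_arg_types;
  sig.inputs.insert(sig.inputs.end(), extern_types.begin(), extern_types.end());
  sig.results = terminator_types;
  if (extern_values_passthrough)
    sig.results.insert(sig.results.end(), extern_types.begin(), extern_types.end());
  return sig;
}

int main() {
  Signature sig = BuildOutlinedSignature({"tensor<i32>"}, {"tensor<f32>"},
                                         {"tensor<i1>"},
                                         /*extern_values_passthrough=*/true);
  for (const auto& t : sig.inputs) std::cout << "input: " << t << "\n";
  for (const auto& t : sig.results) std::cout << "result: " << t << "\n";
}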
- auto outlined_func = - builder.create(loc, name, type, ArrayRef{}); - - outlined_func.getBody().takeBody(region); + auto outlined_func = builder.create(loc, name, type); Region& func_region = outlined_func.getBody(); + func_region.takeBody(region); Block& first_block = func_region.front(); // Replace all external uses with function arguments. @@ -115,27 +136,24 @@ void ExtractSingleBlockRegion(Region& region, FunctionType type, StringRef name, replaceAllUsesInRegionWith(it.value(), arg, func_region); } - // Replace the existing terminator with a return. - Operation* terminator = outlined_func.getBody().front().getTerminator(); - builder.setInsertionPoint(terminator); + // Function return values are all the terminator operands + pass through + // extern values (if enabled). + auto return_values = llvm::to_vector<4>(terminator->getOperands()); + if (extern_values_passthrough) + return_values.insert(return_values.end(), + first_block.args_begin() + num_region_arguments, + first_block.args_end()); - SmallVector return_values; - return_values.reserve(terminator->getNumOperands()); - for (auto it : llvm::enumerate(type.getResults())) { - Value ret_val = terminator->getOperand(it.index()); - // Add a cast operation if types do not match. - if (ret_val.getType() != it.value()) { - ret_val = - builder.create(terminator->getLoc(), it.value(), ret_val); - } - return_values.push_back(ret_val); - } + // Replace the existing terminator with a return. + terminator = first_block.getTerminator(); + builder.setInsertionPoint(terminator); builder.create(terminator->getLoc(), return_values); terminator->erase(); + outlined_func.setVisibility(FuncOp::Visibility::Private); // Add the outlined function to the worklist in case its body has - // IfRegion ops that need to converted. + // IfRegion or WhileRegion ops that need to be converted. worklist.push_back(outlined_func); } @@ -170,17 +188,29 @@ llvm::Optional IsSingleCallRegion(Region& region) { return call; } -// Returns whether the arguments of the given call are same as the given list of -// arguments (after looking through cast ops). -bool MatchCallArgs(CallOp call, llvm::SmallVectorImpl& args) { - if (call.getNumOperands() != args.size()) return false; +using MatcherFn = function_ref; - for (auto it : llvm::enumerate(args)) { - Value arg = call.getOperand(it.index()); - if (auto cast = dyn_cast_or_null(arg.getDefiningOp())) - arg = cast.getOperand(); +// Returns whether the arguments of the given two calls match (after looking +// through cast ops). `matcher` is the predicate used to check if two arguments +// match. +bool MatchCallArgs(CallOp first, CallOp second, MatcherFn matcher) { + if (first.getNumOperands() != second.getNumOperands()) return false; - if (arg != it.value()) return false; + Region& first_region = *first.getParentRegion(); + Region& second_region = *second.getParentRegion(); + + for (auto it : llvm::zip(first.getArgOperands(), second.getArgOperands())) { + // Get the defining Op, skipping over casts. + auto get_defining_op = [](Value value) { + while (llvm::isa_and_nonnull(value.getDefiningOp())) + value = cast(value.getDefiningOp()).getOperand(); + return value; + }; + Value first_arg = get_defining_op(std::get<0>(it)); + Value second_arg = get_defining_op(std::get<1>(it)); + + if (!matcher(first_arg, first_region, second_arg, second_region)) + return false; } return true; } @@ -193,11 +223,10 @@ struct TrivialTransformInfo { bool can_transform = false; // List of callee names (one for each region).
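// Aside: MatchCallArgs above walks the two call sites' operands in lockstep,
// looks through intervening casts, and defers the actual equivalence test to a
// caller-supplied matcher. A rough standalone analogue, assuming a hypothetical
// value model (a value is an integer id, casts are recorded in a side map); it
// is a sketch of the idea, not the MLIR implementation.
#include <cstddef>
#include <functional>
#include <iostream>
#include <unordered_map>
#include <vector>

using ValueId = int;
using CastMap = std::unordered_map<ValueId, ValueId>;  // cast result -> operand
using Matcher = std::function<bool(ValueId, ValueId)>;

// Follows chains of casts back to the originally defined value.
ValueId LookThroughCasts(ValueId v, const CastMap& cast_of) {
  auto it = cast_of.find(v);
  while (it != cast_of.end()) {
    v = it->second;
    it = cast_of.find(v);
  }
  return v;
}

// Analogue of MatchCallArgs: same arity, and every operand pair must satisfy
// the matcher after looking through casts.
bool MatchCallArgs(const std::vector<ValueId>& first,
                   const std::vector<ValueId>& second, const CastMap& cast_of,
                   const Matcher& matcher) {
  if (first.size() != second.size()) return false;
  for (std::size_t i = 0; i < first.size(); ++i) {
    if (!matcher(LookThroughCasts(first[i], cast_of),
                 LookThroughCasts(second[i], cast_of)))
      return false;
  }
  return true;
}

int main() {
  CastMap cast_of{{10, 1}};  // value 10 is a cast of value 1
  auto same_value = [](ValueId a, ValueId b) { return a == b; };
  std::cout << std::boolalpha
            << MatchCallArgs({10, 2}, {1, 2}, cast_of, same_value) << "\n";
}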
- llvm::SmallVector callee_names; + llvm::SmallVector callee_names; - // List of arguments used in these call (each call uses the same arguments - // potentially through casts). - llvm::SmallVector call_args; + // Constructor will analyze the 2 regions. + TrivialTransformInfo(Region& first, Region& second, MatcherFn matcher); }; // Analyzes the given set of regions (attached to the same parent op) to check @@ -206,88 +235,62 @@ struct TrivialTransformInfo { // regions are single call regions and the all the calls have the same // arguments. // -// If this trivial transformation is possible, return the relevant information -// needed for the transformation (in `TrivialTransformInfo`), else indicate that -// a trivial transformation is not possible by setting `can_transform` false. -TrivialTransformInfo AnalyzeForTrivialTransform(ArrayRef regions) { - const TrivialTransformInfo cannot_transform; +// If such a trivial transformation is possible, stash the relevant information +// needed for the transformation, else indicate that a trivial transformation is +// not possible by setting `can_transform` to false. +TrivialTransformInfo::TrivialTransformInfo(Region& first, Region& second, + MatcherFn matcher) { + auto call0 = IsSingleCallRegion(first); + auto call1 = IsSingleCallRegion(second); + if (!call0 || !call1) return; - if (regions.empty()) return cannot_transform; + if (!MatchCallArgs(call0.getValue(), call1.getValue(), matcher)) return; - llvm::SmallVector calls; - calls.reserve(regions.size()); - - // Verify each region is a single call and collect these calls. - for (Region* region : regions) { - auto call = IsSingleCallRegion(*region); - if (!call.hasValue()) return cannot_transform; - calls.push_back(call.getValue()); - } - - llvm::SmallVector callees; - callees.reserve(regions.size()); - - CallOp call0 = calls[0]; - int num_args = call0.getNumOperands(); - - // Collect arguments of the first call. - llvm::SmallVector call0_args; - call0_args.reserve(num_args); - for (Value arg : call0.getArgOperands()) { - if (auto cast = dyn_cast_or_null(arg.getDefiningOp())) - arg = cast.getOperand(); - call0_args.push_back(arg); - } - - // Match arguments of rest of the calls with those of the first call. - for (auto call : calls) { - if (call != call0 && !MatchCallArgs(call, call0_args)) - return cannot_transform; - callees.push_back(call.getCallee()); - } - - return {true, callees, call0_args}; + can_transform = true; + callee_names = {call0.getValue().getCallee(), call1.getValue().getCallee()}; } // Transform IfRegionOp to IfOp. LogicalResult RegionControlFlowToFunctional::ConvertIfOp(IfRegionOp if_region) { - const TrivialTransformInfo tti = AnalyzeForTrivialTransform( - {&if_region.then_branch(), &if_region.else_branch()}); + llvm::SmallVector extern_values; + + // For IfOp, arguments of calls in the then and else regions match if they + // are the same value. + auto if_matcher = [&](Value first, Region&, Value second, Region&) { + if (first != second) return false; + + // collect the call arguments post lookup through cast Op's + extern_values.push_back(first); + return true; + }; + + const TrivialTransformInfo tti(if_region.then_branch(), + if_region.else_branch(), if_matcher); std::string then_name, else_name; - llvm::SmallVector extern_values; if (tti.can_transform) { // We can transform to functional form trivially without outlining. 
then_name = tti.callee_names[0].str(); else_name = tti.callee_names[1].str(); - extern_values = tti.call_args; } else { // Collect external values that are used within the else and then bodies. - extern_values = CollectExternValues( - {&if_region.then_branch(), &if_region.else_branch()}); + extern_values = + CollectExternValues(if_region.then_branch(), if_region.else_branch()); // These external values need to be added as inputs to the generated If. The // order is determined by the order of these values the `extern_vales`. - // Build the type for the outlined function. - llvm::SmallVector input_types; - input_types.reserve(extern_values.size()); - for (auto input : extern_values) input_types.push_back(input.getType()); - - FunctionType func_type = FunctionType::get( - input_types, if_region.getResultTypes(), if_region.getContext()); - // Create 2 new functions with the input signature matching this order, // and outline the `then` and `else` regions by moving the bodies of these // regions into these functions. Replace tf.yield with a regular return. then_name = GetName(if_region, "_then"); - ExtractSingleBlockRegion(if_region.then_branch(), func_type, then_name, - extern_values, worklist); + ExtractSingleBlockRegion(if_region.then_branch(), then_name, extern_values, + worklist, /*extern_values_passthrough=*/false); else_name = GetName(if_region, "_else"); - ExtractSingleBlockRegion(if_region.else_branch(), func_type, else_name, - extern_values, worklist); + ExtractSingleBlockRegion(if_region.else_branch(), else_name, extern_values, + worklist, /*extern_values_passthrough=*/false); } // Once we have the `then` and `else` functions ready (either outlined or @@ -297,24 +300,111 @@ LogicalResult RegionControlFlowToFunctional::ConvertIfOp(IfRegionOp if_region) { auto if_op = builder.create( if_region.getLoc(), if_region.getResultTypes(), if_region.cond(), extern_values, then_name, else_name, if_region.is_stateless()); + CopyUnderscoredAttributes(if_region, if_op); if_region.replaceAllUsesWith(if_op.getResults()); if_region.erase(); return success(); } +// Transform WhileRegion to WhileOp. +LogicalResult RegionControlFlowToFunctional::ConvertWhileOp( + WhileRegionOp while_region) { + // For While, the arguments of the calls in the body and cond regions match + // if they are region arguments with the same region argument numbers. If the + // two calls have the same value (an extern value) used as an argument, we + // cannot do a trivial transformation because post transform, we will need to + // pass this extern value as an argument to the function, so we cannot use the + // existing function as is. + auto while_matcher = [](Value first, Region& first_region, Value second, + Region& second_region) { + if (!first.isa() || !second.isa()) + return false; + BlockArgument first_block_arg = first.cast(); + BlockArgument second_block_arg = second.cast(); + + // Two block arguments will match if they have the same argument number, and + // are block arguments of the corresponding containing regions. + return first_block_arg.getArgNumber() == second_block_arg.getArgNumber() && + first_block_arg.getParentBlock() == &first_region.front() && + second_block_arg.getParentBlock() == &second_region.front(); + }; + + const TrivialTransformInfo tti(while_region.cond(), while_region.body(), + while_matcher); + + // All existing inputs to the while region are inputs to the functional while.
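// Aside: the while matcher above only accepts pairs of block arguments that
// carry the same argument number and live in the entry blocks of their
// respective regions; anything else (in particular a shared captured value)
// defeats trivial reuse of the existing cond/body functions. A simplified
// standalone analogue with a hypothetical ArgRef struct; it mirrors only the
// predicate, not MLIR's BlockArgument.
#include <iostream>
#include <optional>

// Hypothetical stand-in for a call operand: either a block argument of some
// region (region id + argument number) or any other value.
struct ArgRef {
  std::optional<int> owning_region;  // set only for block arguments
  int arg_number = -1;
};

// Analogue of while_matcher: both operands must be block arguments, with the
// same argument number, and each must belong to its own (cond/body) region.
bool WhileArgsMatch(const ArgRef& first, int first_region, const ArgRef& second,
                    int second_region) {
  if (!first.owning_region || !second.owning_region) return false;
  return first.arg_number == second.arg_number &&
         *first.owning_region == first_region &&
         *second.owning_region == second_region;
}

int main() {
  constexpr int kCondRegion = 0, kBodyRegion = 1;
  ArgRef cond_arg{kCondRegion, 0}, body_arg{kBodyRegion, 0};
  ArgRef captured{};  // not a block argument: trivial transform not possible
  std::cout << std::boolalpha
            << WhileArgsMatch(cond_arg, kCondRegion, body_arg, kBodyRegion)
            << " "
            << WhileArgsMatch(captured, kCondRegion, captured, kBodyRegion)
            << "\n";  // true false
}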
+ auto new_inputs = llvm::to_vector<4>(while_region.getOperands()); + + // All existing results will also be generated by the functional while. + auto new_result_types = llvm::to_vector<4>(while_region.getResultTypes()); + + std::string cond_name, body_name; + if (tti.can_transform) { + // We can transform to functional form trivially without outlining. + cond_name = tti.callee_names[0].str(); + body_name = tti.callee_names[1].str(); + } else { + // The WhileRegion regions can refer to either arguments of the region, or + // external values implicitly captured by the region. When converting to + // functional form, all such external values need to become function + // arguments of the outlined functions, and become pass through values in + // the outlined body function. So when outlining the while body, in addition + // to the region arguments, all these external references need to be added + // as function arguments. + llvm::SmallVector extern_values = + CollectExternValues(while_region.cond(), while_region.body()); + + // Outline the `cond` and `body` regions by moving the bodies of these + // regions into new functions. Replace tf.yield with a regular return. + cond_name = GetName(while_region, "_cond"); + ExtractSingleBlockRegion(while_region.cond(), cond_name, extern_values, + worklist, /*extern_values_passthrough=*/false); + + body_name = GetName(while_region, "_body"); + ExtractSingleBlockRegion(while_region.body(), body_name, extern_values, + worklist, /*extern_values_passthrough=*/true); + + // All extern values become additional inputs and additional output types + // for the functional while. + new_inputs.append(extern_values.begin(), extern_values.end()); + for (auto ext : extern_values) new_result_types.push_back(ext.getType()); + } + + // Once we have the `cond` and `body` functions ready (either outlined or + // existing ones), replace the region based op with a functional op. + OpBuilder builder(while_region); + auto while_op = builder.create( + while_region.getLoc(), new_result_types, new_inputs, cond_name, body_name, + while_region.parallel_iterations(), while_region.is_stateless()); + CopyUnderscoredAttributes(while_region, while_op); + + // Redirect old results to new results. + for (auto it : llvm::zip( + while_region.getResults(), + while_op.getResults().take_front(while_region.getNumResults()))) + std::get<0>(it).replaceAllUsesWith(std::get<1>(it)); + + while_region.erase(); + return success(); +} + void RegionControlFlowToFunctional::runOnOperation() { ModuleOp module = getOperation(); // Seed worklist with all functions in the module. 
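// Aside: the driver below is a classic worklist fixpoint. It seeds the worklist
// with every function in the module, converts any region-based control flow it
// finds, and, because outlining creates new functions whose bodies may contain
// further region ops, pushes those new functions back onto the worklist. The
// standalone sketch below mirrors that shape with hypothetical Function and
// ConvertRegionOp helpers; error handling in the real pass is elided here.
#include <iostream>
#include <string>
#include <vector>

// Hypothetical model: a "function" owns a count of region ops still to convert;
// converting a region op outlines new (already converted) functions.
struct Function {
  std::string name;
  int region_ops = 0;
};

// Pretend each conversion outlines exactly two functions (then/else, cond/body).
std::vector<Function> ConvertRegionOp(const Function& parent, int index) {
  return {{parent.name + "_outlined_a" + std::to_string(index), 0},
          {parent.name + "_outlined_b" + std::to_string(index), 0}};
}

int main() {
  // Seed the worklist with the module's functions, then drain it; newly
  // outlined functions are appended so nested region ops also get converted.
  std::vector<Function> worklist = {{"main", 2}};
  int converted = 0;
  while (!worklist.empty()) {
    Function func = worklist.back();
    worklist.pop_back();
    for (int i = 0; i < func.region_ops; ++i) {
      auto outlined = ConvertRegionOp(func, i);
      worklist.insert(worklist.end(), outlined.begin(), outlined.end());
      ++converted;
    }
  }
  std::cout << "converted " << converted << " region ops\n";
}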
worklist = llvm::to_vector<4>(module.getOps()); - while (!worklist.empty()) { FuncOp function = worklist.pop_back_val(); auto result = function.walk([&](Operation* op) { - if (IfRegionOp if_region = llvm::dyn_cast(op)) { + if (auto if_region = llvm::dyn_cast(op)) { if (failed(ConvertIfOp(if_region))) { - if_region.emitOpError() << " failed to convert to functional form"; + op->emitOpError() << "failed to convert to functional form"; + return WalkResult::interrupt(); + } + } else if (auto while_region = llvm::dyn_cast(op)) { + if (failed(ConvertWhileOp(while_region))) { + op->emitOpError() << "failed to convert to functional form"; return WalkResult::interrupt(); } } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc b/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc index b16868311f0..ef75f90d5c1 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc @@ -20,6 +20,7 @@ limitations under the License. #include #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Sequence.h" @@ -32,12 +33,14 @@ limitations under the License. #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Diagnostics.h" // from @llvm-project #include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project #include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/device_util.h" #include "tensorflow/core/platform/logging.h" namespace mlir { @@ -45,10 +48,11 @@ namespace TFDevice { namespace { constexpr char kDeviceAttr[] = "device"; constexpr char kReplicaIdAttr[] = "_xla_replica_id"; +constexpr char kDeviceOrdinalAttr[] = "device_ordinal"; struct ReplicateToIslandPass - : public PassWrapper { - void runOnFunction() override; + : public PassWrapper> { + void runOnOperation() override; }; // Returns whether op requires `_xla_replica_id` attribute. @@ -57,29 +61,207 @@ bool RequiresReplicaIDAttribute(Operation* op) { TF::EnqueueTPUEmbeddingRaggedTensorBatchOp>(op); } -// Adds integer attribute that represents replica id for replicated ops that -// require replica id attribute. -void AddReplicaIdToOpsInReplicatedRegion(OpBuilder* builder, Region* region, - const int replica_id) { - region->walk([&](Operation* replicated_op) { - if (RequiresReplicaIDAttribute(replicated_op)) - replicated_op->setAttr(kReplicaIdAttr, - builder->getI32IntegerAttr(replica_id)); +bool RequiresDeviceOrdinalAttribute(Operation* op) { + return llvm::isa(op) || + llvm::isa(op); +} + +// Checks if a region contains ops that are replica variant. 
+bool HasReplicaVariantOps(Region& region, + const llvm::Optional& devices) { + auto result = region.walk([&](Operation* op) { + if (RequiresReplicaIDAttribute(op) || + (devices.hasValue() && RequiresDeviceOrdinalAttribute(op))) + return WalkResult::interrupt(); + + if (auto launch = dyn_cast(op)) + if (devices.hasValue() && devices.getValue().get(launch.device())) + return WalkResult::interrupt(); + + return WalkResult::advance(); }); + return result.wasInterrupted(); +} + +// Collects all functions reachable from a region, including transitive ones. +llvm::SmallPtrSet GetReachableFunctionsFromRegion(ModuleOp module, + Region& region) { + llvm::SmallPtrSet visited_functions; + + SymbolTable symbol_table(module); + auto symbol_uses = symbol_table.getSymbolUses(®ion); + if (!symbol_uses) return {}; + + for (auto& use : *symbol_uses) + if (auto func = + symbol_table.lookup(use.getSymbolRef().getRootReference())) + visited_functions.insert(func); + + llvm::SmallVector functions_to_visit(visited_functions.begin(), + visited_functions.end()); + while (!functions_to_visit.empty()) { + llvm::SmallVector new_functions_to_visit; + + for (FuncOp function_to_visit : functions_to_visit) { + auto func_symbol_uses = + symbol_table.getSymbolUses(function_to_visit.getCallableRegion()); + if (!func_symbol_uses) continue; + + for (auto& use : *func_symbol_uses) + if (auto func = symbol_table.lookup( + use.getSymbolRef().getRootReference())) + if (visited_functions.insert(func).second) + new_functions_to_visit.push_back(func); + } + + functions_to_visit.swap(new_functions_to_visit); + } + + return visited_functions; +} + +// Collects all functions and transitive functions reachable from region that +// contain replicate variant ops. +llvm::SmallDenseMap GetReachableFunctionsToClone( + ModuleOp module, Region& region, + const llvm::Optional& devices) { + llvm::SmallPtrSet reachable_functions = + GetReachableFunctionsFromRegion(module, region); + + llvm::SmallDenseMap functions_to_clone; + llvm::SmallVector functions_to_visit; + for (FuncOp func : reachable_functions) { + if (!func.getCallableRegion()) continue; + if (HasReplicaVariantOps(*func.getCallableRegion(), devices)) { + functions_to_clone.insert({func.getName(), func}); + functions_to_visit.push_back(func); + } + } + + while (!functions_to_visit.empty()) { + llvm::SmallVector new_functions_to_visit; + + for (FuncOp func_to_visit : functions_to_visit) { + auto func_uses = func_to_visit.getSymbolUses(module); + if (!func_uses) continue; + for (auto use : *func_uses) { + auto parent_func = use.getUser()->getParentOfType(); + if (!parent_func || !reachable_functions.contains(parent_func) || + !functions_to_clone.insert({parent_func.getName(), parent_func}) + .second) + continue; + new_functions_to_visit.push_back(parent_func); + } + } + + functions_to_visit.swap(new_functions_to_visit); + } + + return functions_to_clone; +} + +struct FuncOldNameAndClone { + StringRef old_name; + FuncOp clone; +}; + +// Replaces all symbol uses with cloned functions, for `region` and across the +// cloned functions themselves. 
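// Aside: GetReachableFunctionsFromRegion above computes a transitive closure
// over symbol uses: functions directly referenced from the region seed a
// worklist, and each visited function contributes its own callees until nothing
// new is discovered. A plain-C++ sketch of the same closure over a hypothetical
// call graph keyed by function name; it illustrates the traversal only.
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

using CallGraph = std::map<std::string, std::vector<std::string>>;

// Starts from the functions directly used by a region and keeps following
// callees until the visited set stops growing.
std::set<std::string> ReachableFunctions(
    const CallGraph& graph, const std::vector<std::string>& roots) {
  std::set<std::string> visited(roots.begin(), roots.end());
  std::vector<std::string> worklist(roots.begin(), roots.end());
  while (!worklist.empty()) {
    std::string current = worklist.back();
    worklist.pop_back();
    auto it = graph.find(current);
    if (it == graph.end()) continue;
    for (const std::string& callee : it->second)
      if (visited.insert(callee).second) worklist.push_back(callee);
  }
  return visited;
}

int main() {
  CallGraph graph{{"launch_fn", {"helper"}}, {"helper", {"leaf"}}};
  for (const auto& name : ReachableFunctions(graph, {"launch_fn"}))
    std::cout << name << "\n";  // helper, launch_fn, leaf
}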
+LogicalResult UpdateSymbolUsesWithClones( + SymbolTable& symbol_table, ModuleOp module, Region& region, + llvm::MutableArrayRef cloned_functions) { + llvm::SmallVector, 4> old_to_new_names; + old_to_new_names.reserve(cloned_functions.size()); + for (auto& cloned_function : cloned_functions) + old_to_new_names.push_back( + {cloned_function.old_name, cloned_function.clone.getName()}); + + for (const auto& old_to_new_name : old_to_new_names) { + if (failed(symbol_table.replaceAllSymbolUses( + old_to_new_name.first, old_to_new_name.second, ®ion))) + return failure(); + + for (auto& cloned_function : cloned_functions) + if (failed(symbol_table.replaceAllSymbolUses( + old_to_new_name.first, old_to_new_name.second, + cloned_function.clone.getCallableRegion()))) + return failure(); + } + return success(); +} + +// Collects TPU device ordinal for outside compilation communication ops. This +// currently assumes outside compilation only uses `TPU_REPLICATED_CORE_0` +// aliased device for the device computation. +llvm::Optional GetDeviceOrdinal( + const llvm::Optional& devices, Location loc, + unsigned replica_id) { + int64_t device_ordinal = 0; + if (devices.hasValue()) { + if (auto tpu_replica_0 = devices.getValue().get("TPU_REPLICATED_CORE_0")) { + llvm::StringRef tpu_device = tpu_replica_0.cast()[replica_id] + .cast() + .getValue(); + if (succeeded(tensorflow::GetDeviceOrdinalFromDeviceString( + loc, tpu_device, &device_ordinal))) { + return llvm::Optional(device_ordinal); + } + } + } + return llvm::None; +} + +// Updates replica variant ops in a region based on replica `replica_id`. +// TODO(b/157624749): Replace this with better abstraction to differentiate ops +// for different replicas. Some ops, such as XlaHostCompute op or TPU Embedding +// ops, require replica id to be added as an op attribute to be used during +// execution. Handle such ops separately and add an integer attribute that +// represents replica id. +LogicalResult UpdateRegionReplicateVariantOps( + OpBuilder& builder, Location loc, Region& region, int replica_id, + llvm::MutableArrayRef cloned_functions, + const llvm::Optional& devices) { + llvm::Optional device_ordinal = + GetDeviceOrdinal(devices, loc, replica_id); + + auto update_replicate_variant_ops = [&](Operation* op) { + // Add replica id. + if (RequiresReplicaIDAttribute(op)) + op->setAttr(kReplicaIdAttr, builder.getI32IntegerAttr(replica_id)); + + if (!devices.hasValue()) return; + + // Map aliased devices to explicit devices based on replica. + if (auto launch = dyn_cast(op)) + if (auto device_by_replica = devices.getValue().get(launch.device())) + launch.setAttr( + kDeviceAttr, + device_by_replica.cast()[replica_id].cast()); + + // Add device ordinal. + if (device_ordinal && RequiresDeviceOrdinalAttribute(op)) + op->setAttr(kDeviceOrdinalAttr, + builder.getI64IntegerAttr(*device_ordinal)); + }; + + region.walk(update_replicate_variant_ops); + for (auto& cloned_function : cloned_functions) + cloned_function.clone.getCallableRegion()->walk( + update_replicate_variant_ops); + + return success(); } // Creates islands per replica from `tf_device.replicate` region. If for a // `tf_device.launch` op the device is an aliased device of the // `tf_device.replicate`, the device will be remapped to an explicit device // for the associated replica island. 
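// Aside: GetDeviceOrdinal and UpdateRegionReplicateVariantOps above specialize
// each replica: the aliased device list is indexed by replica id, and for ops
// that need it a device ordinal is derived from the chosen device string. The
// sketch below mirrors that selection with plain maps and assumes, purely for
// illustration, that the ordinal is the trailing number of a device string such
// as "/job:worker/task:0/device:TPU:1"; the real pass delegates this parsing to
// GetDeviceOrdinalFromDeviceString.
#include <iostream>
#include <map>
#include <optional>
#include <string>
#include <vector>

using DeviceMap = std::map<std::string, std::vector<std::string>>;

// Picks the concrete device for `replica_id` from an aliased device list such
// as devices["TPU_REPLICATED_CORE_0"].
std::optional<std::string> DeviceForReplica(const DeviceMap& devices,
                                            const std::string& alias,
                                            int replica_id) {
  auto it = devices.find(alias);
  if (it == devices.end() || replica_id >= static_cast<int>(it->second.size()))
    return std::nullopt;
  return it->second[replica_id];
}

// Illustrative only: treat the text after the last ':' as the device ordinal.
std::optional<int> DeviceOrdinalFromString(const std::string& device) {
  auto pos = device.rfind(':');
  if (pos == std::string::npos || pos + 1 >= device.size()) return std::nullopt;
  return std::stoi(device.substr(pos + 1));
}

int main() {
  DeviceMap devices{{"TPU_REPLICATED_CORE_0",
                     {"/job:worker/task:0/device:TPU:0",
                      "/job:worker/task:0/device:TPU:1"}}};
  auto device = DeviceForReplica(devices, "TPU_REPLICATED_CORE_0", 1);
  if (device) {
    auto ordinal = DeviceOrdinalFromString(*device);
    std::cout << *device << " -> ordinal " << (ordinal ? *ordinal : -1) << "\n";
  }
}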
-llvm::SmallVector ExpandReplicateIntoReplicas( - const Dialect* tf_dialect, OpBuilder* builder, +LogicalResult ExpandReplicateIntoReplicas( + const Dialect* tf_dialect, OpBuilder& builder, ModuleOp module, tf_executor::IslandOp island_op, tf_device::ReplicateOp replicate_op, - int num_replicas) { - auto devices = replicate_op.devices(); - const bool has_devices = devices.hasValue(); - llvm::SmallVector replicas; + int num_replicas, llvm::SmallVectorImpl& replicas) { replicas.reserve(num_replicas); + auto devices = replicate_op.devices(); // Collect result types and operands. Operation& terminator = replicate_op.GetBody().back(); @@ -88,16 +270,30 @@ llvm::SmallVector ExpandReplicateIntoReplicas( llvm::SmallVector replica_inputs(island_op.controlInputs()); // Replace replicate terminator with YieldOp. - builder->setInsertionPoint(&terminator); - builder->create(terminator.getLoc(), - terminator.getOperands()); + builder.setInsertionPoint(&terminator); + builder.create(terminator.getLoc(), + terminator.getOperands()); terminator.erase(); - builder->setInsertionPoint(island_op); + auto funcs_to_clone = + GetReachableFunctionsToClone(module, replicate_op.body(), devices); + SymbolTable symbol_table(module); + + builder.setInsertionPoint(island_op); BlockAndValueMapping mapping; for (int i : llvm::seq(0, num_replicas)) { + // Clone reachable functions with replica variant ops. + llvm::SmallVector cloned_functions; + cloned_functions.reserve(funcs_to_clone.size()); + for (auto& func_to_clone : funcs_to_clone) { + auto cloned_function = func_to_clone.getSecond().clone(); + symbol_table.insert(cloned_function, module.end()); + cloned_functions.push_back( + {func_to_clone.getSecond().getName(), cloned_function}); + } + // Create new island for replica. - auto replica = builder->create( + auto replica = builder.create( island_op.getLoc(), output_types, control_type, replica_inputs); // Map block arg to replica arg. @@ -109,28 +305,19 @@ llvm::SmallVector ExpandReplicateIntoReplicas( // Copy over replicate region into replica island. replicate_op.body().cloneInto(&replica.body(), mapping); - // TODO(b/157624749): Replace this with better abstraction to - // differentiate ops for different replicas. - // Some ops, such as XlaHostCompute op or TPU Embedding ops, require - // replica id to be added as an op attribute to be used during - // execution. Handle such ops separately and add an integer attribute - // that represents replica id. - AddReplicaIdToOpsInReplicatedRegion(builder, &replica.body(), i); + if (failed(UpdateSymbolUsesWithClones(symbol_table, module, replica.body(), + cloned_functions))) + return failure(); - // Map aliased devices to explicit devices based on replica. 
- if (has_devices) { - replica.walk([&](tf_device::LaunchOp launch) { - if (auto device_by_replica = devices.getValue().get(launch.device())) - launch.setAttr( - kDeviceAttr, - device_by_replica.cast()[i].cast()); - }); - } + if (failed(UpdateRegionReplicateVariantOps( + builder, replicate_op.getLoc(), replica.body(), + /*replica_id=*/i, cloned_functions, devices))) + return failure(); replicas.push_back(replica); } - return replicas; + return success(); } // Creates islands per replica from `tf_device.replicate` region and remap @@ -183,17 +370,19 @@ llvm::SmallVector ExpandReplicateIntoReplicas( // }) {device = "/DEVICE:3"} : () -> tensor // tf_executor.yield %a1, %b1 : tensor, tensor // } -void CreateIslandsFromReplicate(const Dialect* tf_dialect, - tf_executor::GraphOp graph_op, - tf_executor::IslandOp island_op, - tf_device::ReplicateOp replicate_op) { +LogicalResult CreateIslandsFromReplicate(const Dialect* tf_dialect, + ModuleOp module, + tf_executor::GraphOp graph_op, + tf_executor::IslandOp island_op, + tf_device::ReplicateOp replicate_op) { OpBuilder builder(island_op); const int num_replicas = replicate_op.n().getLimitedValue(); // Create islands per replica. - llvm::SmallVector replicas = - ExpandReplicateIntoReplicas(tf_dialect, &builder, island_op, replicate_op, - num_replicas); + llvm::SmallVector replicas; + if (failed(ExpandReplicateIntoReplicas(tf_dialect, builder, module, island_op, + replicate_op, num_replicas, replicas))) + return failure(); // Collect all replica results. llvm::SmallVector replicas_outputs(replicate_op.getNumResults(), @@ -244,36 +433,41 @@ void CreateIslandsFromReplicate(const Dialect* tf_dialect, } island_op.erase(); + return success(); } -// Finds islands with a single `tf_device.replicate` and create individual -// islands per replica of the replicate. -void LowerSingleIslandReplicateToIslands(const Dialect* tf_dialect, - tf_executor::GraphOp graph_op, - tf_executor::IslandOp island_op) { - if (!island_op.WrapsSingleOp()) return; - - if (auto replicate_op = - llvm::dyn_cast(&island_op.GetBody().front())) - CreateIslandsFromReplicate(tf_dialect, graph_op, island_op, replicate_op); -} - -void ReplicateToIslandPass::runOnFunction() { +void ReplicateToIslandPass::runOnOperation() { + auto module = getOperation(); const Dialect* tf_dialect = getContext().getRegisteredDialect("tf"); if (!tf_dialect) { - signalPassFailure(); - getFunction().emitError() << "'tf' dialect is not registered"; + module.emitError() << "'tf' dialect is not registered"; + return signalPassFailure(); } - getFunction().walk([&](tf_executor::GraphOp graph_op) { - for (auto island_op : - llvm::make_early_inc_range(graph_op.getOps())) - LowerSingleIslandReplicateToIslands(tf_dialect, graph_op, island_op); + // Find islands with a single `tf_device.replicate` and create individual + // islands per replica of the replicate. 
+ llvm::SmallVector replicate_op_islands; + module.walk([&](tf_executor::GraphOp graph_op) { + for (auto island_op : graph_op.getOps()) { + if (!island_op.WrapsSingleOp()) continue; + + if (isa(&island_op.GetBody().front())) + replicate_op_islands.push_back(island_op); + } }); + + for (tf_executor::IslandOp island_op : replicate_op_islands) { + auto graph_op = island_op.getParentOfType(); + auto replicate_op = + cast(island_op.GetBody().front()); + if (failed(CreateIslandsFromReplicate(tf_dialect, module, graph_op, + island_op, replicate_op))) + return signalPassFailure(); + } } } // anonymous namespace -std::unique_ptr> CreateReplicateToIslandPass() { +std::unique_ptr> CreateReplicateToIslandPass() { return std::make_unique(); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/resource_device_inference.cc b/tensorflow/compiler/mlir/tensorflow/transforms/resource_device_inference.cc index 21d74d81b20..7e8e9ee30c8 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/resource_device_inference.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/resource_device_inference.cc @@ -36,7 +36,7 @@ limitations under the License. #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project -#include "tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h" +#include "tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" @@ -61,7 +61,9 @@ struct ResourceDeviceInference // A class that records each resource's device assignment in a function. class PerFunctionResult { public: - explicit PerFunctionResult(FuncOp func_op) : alias_analysis_(func_op) {} + explicit PerFunctionResult( + FuncOp func_op, const TF::ResourceAliasAnalysis::Info& alias_analysis) + : alias_analysis_(alias_analysis) {} // Returns the recorded device assignment for a resource, if any. llvm::Optional DeviceForResource( @@ -105,7 +107,7 @@ class PerFunctionResult { private: llvm::SmallDenseMap resource_id_to_device_; - TF::ResourceAliasAnalysis alias_analysis_; + const TF::ResourceAliasAnalysis::Info& alias_analysis_; }; // Tries to record device assignment for a resource. @@ -193,46 +195,50 @@ LogicalResult ComputeResourceDevicesInComputation(FuncOp func_op, void ResourceDeviceInference::runOnOperation() { auto module = getOperation(); + const auto& resource_alias_analysis = + getAnalysis(); + llvm::SmallDenseMap per_function_results; llvm::SetVector worklist; module.walk([&](FuncOp func_op) { worklist.insert(func_op); - per_function_results.try_emplace(func_op, func_op); + per_function_results.try_emplace( + func_op, func_op, resource_alias_analysis.GetAnalysisForFunc(func_op)); }); // Helper that propagates an op's recorded operand device assignments to its // called function's arguments. 
auto propagate_operands_to_callee_arguments = [&](Operation* caller, Operation::operand_range caller_operands, - llvm::StringRef called_func_name, - const PerFunctionResult& caller_res) { - auto callee = - llvm::dyn_cast(module.lookupSymbol(called_func_name)); - assert(callee); - auto& callee_res = per_function_results.find(callee)->getSecond(); - bool callee_needs_recompute = false; - for (auto operand_and_argument : - llvm::zip(caller_operands, callee.getArguments())) { - if (!mlir::getElementTypeOrSelf( - std::get<0>(operand_and_argument).getType()) - .isa()) { - continue; + ArrayRef callees, const PerFunctionResult& caller_res) { + for (FuncOp callee : callees) { + assert(callee); + auto& callee_res = per_function_results.find(callee)->getSecond(); + bool callee_needs_recompute = false; + for (auto operand_and_argument : + llvm::zip(caller_operands, callee.getArguments())) { + if (!mlir::getElementTypeOrSelf( + std::get<0>(operand_and_argument).getType()) + .isa()) { + continue; + } + auto device = + caller_res.DeviceForResource(std::get<0>(operand_and_argument)); + if (!device) continue; + if (failed(AddResourceDeviceAndEmitError( + std::get<1>(operand_and_argument), *device, caller, + &callee_res, &callee_needs_recompute))) { + return failure(); + } } - auto device = - caller_res.DeviceForResource(std::get<0>(operand_and_argument)); - if (!device) continue; - if (failed(AddResourceDeviceAndEmitError( - std::get<1>(operand_and_argument), *device, caller, - &callee_res, &callee_needs_recompute))) { - return failure(); + // If the callee recording is modified, make sure that it will be + // reprocessed. + if (callee_needs_recompute) { + worklist.insert(callee); } } - // If the callee recording is modified, make sure that it will be - // reprocessed. 
- if (callee_needs_recompute) { - worklist.insert(callee); - } return success(); }; + while (!worklist.empty()) { auto func_op = worklist.back(); worklist.pop_back(); @@ -245,18 +251,14 @@ void ResourceDeviceInference::runOnOperation() { auto walk_res = func_op.walk([&](Operation* op) { if (auto while_op = llvm::dyn_cast(op)) { if (failed(propagate_operands_to_callee_arguments( - while_op, while_op.getOperands(), while_op.body(), func_res)) || - failed(propagate_operands_to_callee_arguments( - while_op, while_op.getOperands(), while_op.cond(), func_res))) { + while_op, while_op.getOperands(), + {while_op.body_func(), while_op.cond_func()}, func_res))) return WalkResult::interrupt(); - } } else if (auto if_op = llvm::dyn_cast(op)) { if (failed(propagate_operands_to_callee_arguments( - if_op, if_op.input(), if_op.then_branch(), func_res)) || - failed(propagate_operands_to_callee_arguments( - if_op, if_op.input(), if_op.else_branch(), func_res))) { + if_op, if_op.input(), {if_op.then_func(), if_op.else_func()}, + func_res))) return WalkResult::interrupt(); - } } return WalkResult::advance(); }); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc index 6a67f0bea0a..702455d156d 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc @@ -558,15 +558,13 @@ void AddLoadsStoresOutsideControlFlowOp( auto operand = caller->getOperand(index); builder.setInsertionPoint(caller); new_operands[index] = builder.create( - caller->getLoc(), ArrayRef{new_type}, ArrayRef{operand}, - ArrayRef{}); + caller->getLoc(), ArrayRef{new_type}, ArrayRef{operand}); caller->setOperand(index, new_operands[index]); if (updated_index < 0) continue; builder.setInsertionPointAfter(caller); builder.create( caller->getLoc(), ArrayRef{}, - ArrayRef{operand, caller->getResult(updated_index)}, - ArrayRef{}); + ArrayRef{operand, caller->getResult(updated_index)}); } } @@ -629,8 +627,6 @@ LogicalResult HandleWhileLoop(TF::WhileOp while_op, FuncOp body, FuncOp cond) { }); // Recreate the while op. OpBuilder builder(while_op); - auto new_output_shapes = FilterRange>( - while_op.output_shapes().getValue(), resource_arg_uses); // Now use the filtered original operands, which will be replaced by // AddLoadsStoresOutsideControlFlowOp(). auto new_while = builder.create( @@ -638,8 +634,7 @@ LogicalResult HandleWhileLoop(TF::WhileOp while_op, FuncOp body, FuncOp cond) { FilterRange(while_op.getOperands(), resource_arg_uses), while_op.getAttrs()); - // Prepare for AddLoadsStoresOutsideControlFlowOp() and update - // new_output_shapes. + // Prepare for AddLoadsStoresOutsideControlFlowOp(). llvm::SmallDenseMap> arg_data_type_and_updated_output_index; for (const auto& entry : remaining_resource_data_types) { @@ -649,16 +644,11 @@ LogicalResult HandleWhileLoop(TF::WhileOp while_op, FuncOp body, FuncOp cond) { : entry.getFirst(); arg_data_type_and_updated_output_index[entry.getFirst()] = { entry.getSecond(), update_index}; - if (!new_output_shapes.empty()) { - new_output_shapes[entry.getFirst()] = - tensorflow::ConvertTypeToTensorShapeAttr(entry.getSecond()); - } } AddLoadsStoresOutsideControlFlowOp(new_while, arg_data_type_and_updated_output_index); - new_while.setAttr("output_shapes", builder.getArrayAttr(new_output_shapes)); // Replace uses. 
- for (int64_t i = 0; i < old_to_new_indices.size(); ++i) { + for (int64_t i = 0, end = old_to_new_indices.size(); i < end; ++i) { if (old_to_new_indices[i] >= 0) { while_op.getResult(i).replaceAllUsesWith( new_while.getResult(old_to_new_indices[i])); @@ -687,10 +677,12 @@ LogicalResult HandleCaseOrIfOp(CaseOrIfOp op, ArrayRef branches) { auto retval = func.front().getTerminator()->getOperand(result_index); assert(result.getType() == retval.getType()); auto aliasing_arg = retval.dyn_cast(); + if (!aliasing_arg) + return op.emitOpError("unsupported output: ") + << "resource does not alias input"; if (common_aliasing_arg_num == kUnassigned) common_aliasing_arg_num = aliasing_arg.getArgNumber(); - if (!aliasing_arg || - aliasing_arg.getArgNumber() != common_aliasing_arg_num) + if (aliasing_arg.getArgNumber() != common_aliasing_arg_num) return op.emitOpError("unsupported output: ") << "resource does not alias a single input"; } @@ -760,8 +752,11 @@ LogicalResult HandleCaseOrIfOp(CaseOrIfOp op, ArrayRef branches) { for (auto branch : branches) { auto new_retvals = llvm::to_vector<4>(branch.front().getTerminator()->getOperands()); + new_retvals.resize(new_retvals.size() + resource_arg_to_new_output.size()); for (const auto& entry : resource_arg_to_new_output) { - new_retvals.push_back(branch.getArgument(entry.getFirst())); + int64_t resource_arg_index = entry.getFirst(); + int64_t output_index = entry.getSecond(); + new_retvals[output_index] = branch.getArgument(resource_arg_index); } auto old_return = branch.front().getTerminator(); OpBuilder builder(old_return); @@ -799,7 +794,7 @@ LogicalResult HandleCaseOrIfOp(CaseOrIfOp op, ArrayRef branches) { AddLoadsStoresOutsideControlFlowOp(new_op, arg_data_type_and_updated_output_index); // Replace uses. - for (int64_t i = 0; i < old_to_new_output_indices.size(); ++i) { + for (int64_t i = 0, end = old_to_new_output_indices.size(); i < end; ++i) { if (old_to_new_output_indices[i] >= 0) { op.getResult(i).replaceAllUsesWith( new_op.getResult(old_to_new_output_indices[i])); @@ -943,7 +938,8 @@ void UpdatePartitionedCallOpWithNewCallee( AddLoadsStoresOutsideControlFlowOp( new_call, lifting_info.arg_data_type_and_updated_output_index); // Replace uses. - for (int64_t i = 0; i < lifting_info.old_to_new_output_indices.size(); ++i) { + for (int64_t i = 0, end = lifting_info.old_to_new_output_indices.size(); + i < end; ++i) { if (lifting_info.old_to_new_output_indices[i] >= 0) { call_op.getResult(i).replaceAllUsesWith( new_call.getResult(lifting_info.old_to_new_output_indices[i])); @@ -987,8 +983,8 @@ LogicalResult HoistForFunctionalControlFlow( RemoveIdentity(block); for (Operation& op : llvm::make_early_inc_range(*block)) { if (auto while_op = llvm::dyn_cast(&op)) { - auto body = llvm::cast(module.lookupSymbol(while_op.body())); - auto cond = llvm::cast(module.lookupSymbol(while_op.cond())); + auto body = while_op.body_func(); + auto cond = while_op.cond_func(); // Recursively handle the nested control flow. 
HoistForFunctionalControlFlow(&body.front(), module, lifted_partitioned_call_callees); @@ -996,10 +992,8 @@ LogicalResult HoistForFunctionalControlFlow( lifted_partitioned_call_callees); if (failed(HandleWhileLoop(while_op, body, cond))) return failure(); } else if (auto if_op = llvm::dyn_cast(&op)) { - auto then_branch = - llvm::cast(module.lookupSymbol(if_op.then_branch())); - auto else_branch = - llvm::cast(module.lookupSymbol(if_op.else_branch())); + auto then_branch = if_op.then_func(); + auto else_branch = if_op.else_func(); // Recursively handle the nested control flow. HoistForFunctionalControlFlow(&then_branch.front(), module, lifted_partitioned_call_callees); @@ -1020,12 +1014,10 @@ LogicalResult HoistForFunctionalControlFlow( } if (failed(HandleCaseOrIfOp(case_op, branch_functions))) return failure(); } else if (auto call_op = llvm::dyn_cast(&op)) { - if (!call_op.f().isa()) { + auto callee = call_op.func(); + if (!callee) return call_op.emitOpError( "resource lifting does not support call with nested references."); - } - auto callee = llvm::cast( - module.lookupSymbol(call_op.f().getRootReference())); if (failed(HandlePartitionedCallOp(call_op, callee, module, lifted_partitioned_call_callees))) { // Nested control flow handling is done in HandlePartitionedCallOp(). @@ -1033,8 +1025,7 @@ LogicalResult HoistForFunctionalControlFlow( } } else if (auto call_op = llvm::dyn_cast(&op)) { - auto callee = llvm::cast(module.lookupSymbol(call_op.f())); - if (failed(HandlePartitionedCallOp(call_op, callee, module, + if (failed(HandlePartitionedCallOp(call_op, call_op.func(), module, lifted_partitioned_call_callees))) { return failure(); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc index f9c81634ae5..597fbe2c0b1 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc @@ -39,6 +39,7 @@ limitations under the License. #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/SymbolTable.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project @@ -130,25 +131,28 @@ bool NeedsCastBack(OpOperand& use, Dialect* tf_dialect) { !IsSupportedNonTFOp(use.getOwner()); } -// Inserts tf.Cast operation when changing the type of a result if the user is -// not a TF operation, as we can't guarantee that the new type will be OK. -void AddCastBackForUnsupportedNonTFUses(Operation* op, Value result, - Dialect* tf_dialect, Type old_type) { - // A tf.Cast operation is lazily created on the first uses that isn't a TF - // operation. +// Updates the result of an operation to a new inferred type. Also inserts +// tf.Cast operation for uses that are incompatible with the new type. +void UpdateTypeAndInsertIncompatibleUseCasts(Dialect* tf_dialect, Type new_type, + Operation* op, Value result) { + // A tf.Cast operation is lazily created on the first use requires a cast. 
TF::CastOp cast_op; auto get_cast_op = [&]() { if (!cast_op) { OpBuilder b(op); b.setInsertionPointAfter(op); - cast_op = b.create(op->getLoc(), old_type, result, + cast_op = b.create(op->getLoc(), result.getType(), result, /*truncate=*/b.getBoolAttr(false)); } return Value(cast_op); }; + // First insert cast back for uses that need a cast and then + // update the type. for (OpOperand& use : make_early_inc_range(result.getUses())) { if (NeedsCastBack(use, tf_dialect)) use.set(get_cast_op()); } + + result.setType(new_type); } // Extracts a PartialTensorShape from the MLIR type. @@ -210,36 +214,49 @@ bool CanBeRefined(Type type) { shape_type.getElementType().isa()); } +// Returns whether `original_type` type can be refined with +// `potential_refined_type` type. +bool CanRefineTypeWith(Type original_type, Type potential_refined_type) { + if (original_type == potential_refined_type || !CanBeRefined(original_type)) + return false; + + auto shape_type = potential_refined_type.dyn_cast(); + if (!shape_type) return false; + if (shape_type.hasRank()) return true; + + auto element_type_with_subtype = + shape_type.getElementType().dyn_cast(); + return element_type_with_subtype && + !element_type_with_subtype.GetSubtypes().empty(); +} + +// Refines the type of `result` of `op` using the type `potential_refined_type`. +// Return true if the type was changed. +bool RefineResultType(Operation* op, Value result, + Type potential_refined_type) { + if (!CanRefineTypeWith(result.getType(), potential_refined_type)) + return false; + + UpdateTypeAndInsertIncompatibleUseCasts(op->getDialect(), + potential_refined_type, op, result); + return true; +} + // Infers the shape from a (Stateful)PartionedCall operation by looking up the // called function and propagating the return type. -bool InferShapeForCall(Operation* op) { - auto call_op = cast(op); - CallInterfaceCallable callable = call_op.getCallableForCallee(); - SymbolRefAttr sym = callable.dyn_cast(); - if (!sym) return false; - FuncOp func = dyn_cast(SymbolTable::lookupNearestSymbolFrom(op, sym)); +bool InferShapeForCall(CallOpInterface call_op) { + FuncOp func = dyn_cast(call_op.resolveCallable()); if (!func) return false; + Operation* op = call_op.getOperation(); bool changed = false; // Map each of the results of the call to the returned type of the // function. for (auto result : zip(op->getResults(), func.getType().getResults())) { - if (std::get<0>(result).getType() == std::get<1>(result)) continue; - // Skip already statically shaped results. - if (!CanBeRefined(std::get<0>(result).getType())) continue; - - auto shaped_type = std::get<0>(result).getType().cast(); - auto new_type = std::get<1>(result).dyn_cast(); - if (!new_type) continue; - - // Inserts a cast back to the original type if any user is not in the - // TF dialect. - AddCastBackForUnsupportedNonTFUses(op, std::get<0>(result), - op->getDialect(), shaped_type); - // Finally we inferred the shape and replace the type for this result. 
- std::get<0>(result).setType(new_type); - changed = true; + changed = RefineResultType(op, std::get<0>(result), std::get<1>(result)) || + changed; } + return changed; } @@ -265,12 +282,43 @@ bool InferShapeForCast(CastOp op, Dialect* tf_dialect) { auto new_type = RankedTensorType::get( ranked_op_type.getShape(), result.getType().cast().getElementType()); - auto old_type = result.getType(); - result.setType(new_type); - AddCastBackForUnsupportedNonTFUses(op, op.getResult(), tf_dialect, old_type); + + UpdateTypeAndInsertIncompatibleUseCasts(tf_dialect, new_type, op, + op.getResult()); return true; } +// Infer the shape IfOp outputs based on the shapes of the then and else +// function result types. +bool InferShapeForIf(IfOp op) { + bool changed = false; + auto then_results = op.then_func().getType().getResults(); + auto else_results = op.else_func().getType().getResults(); + for (auto it : llvm::zip(op.getResults(), then_results, else_results)) { + // If then and else types do not match, skip refinement for that result. + if (std::get<1>(it) != std::get<2>(it)) continue; + changed = RefineResultType(op, std::get<0>(it), std::get<1>(it)) || changed; + } + return changed; +} + +// Infer the shape IfRegion outputs based on the shapes of the then and else +// yields. +bool InferShapeForIfRegion(IfRegionOp op) { + bool changed = false; + + Operation* then_yield = op.then_branch().front().getTerminator(); + Operation* else_yield = op.else_branch().front().getTerminator(); + for (auto result : zip(op.getResults(), then_yield->getOperandTypes(), + else_yield->getOperandTypes())) { + // If then and else types do not match, skip refinement for that result. + if (std::get<1>(result) != std::get<2>(result)) continue; + changed = RefineResultType(op, std::get<0>(result), std::get<1>(result)) || + changed; + } + return changed; +} + bool RefineWithInferTypeOpInterface(InferTypeOpInterface infer_ti, Dialect* tf_dialect) { Operation* op = infer_ti.getOperation(); @@ -291,12 +339,8 @@ bool RefineWithInferTypeOpInterface(InferTypeOpInterface infer_ti, for (auto result : zip(op->getResults(), inferred)) { if (std::get<0>(result).getType() == std::get<1>(result)) continue; - // Inserts a cast back to the original type if any user is not in the - // TF dialect. - AddCastBackForUnsupportedNonTFUses(op, std::get<0>(result), - op->getDialect(), std::get<1>(result)); - // Finally we inferred the shape and replace the type for this result. - std::get<0>(result).setType(std::get<1>(result)); + UpdateTypeAndInsertIncompatibleUseCasts( + op->getDialect(), std::get<1>(result), op, std::get<0>(result)); changed = true; } return changed; @@ -485,32 +529,37 @@ class ShapeInference { // 1) They are never reused, ie. having a single use in module. // 2) Their input types match those of their parent ops (excluding inputs // like predicate). - // Returns a boolean indicating whether any change has been applied. - LogicalResult RefineShapeForControlFlowFunc(FuncOp func, - ArrayRef input_types, - int64_t max_iteration); - - // Propagate the shapes to the functions named. LogicalResult PropagateShapeToFunctions( ModuleOp module, Operation::operand_type_range input_types, - ArrayRef func_names, int64_t max_iteration); + ArrayRef functions, int64_t max_iteration); + + // Propagates shapes to regions given the shapes of the inputs of the regions. + // All regions provided in `regions` are assumed to have inputs of type + // `input_types`. 
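// Aside: InferShapeForIf/InferShapeForIfRegion above refine an op result only
// when the then and else branches agree on the type and the candidate actually
// adds information (a rank, or resource/variant subtypes). A standalone sketch
// of that per-result decision with a hypothetical TypeInfo summary; it mirrors
// the decision logic only, not MLIR's type system.
#include <cstddef>
#include <iostream>
#include <vector>

// Hypothetical summary of the properties that drive the refinement decision.
struct TypeInfo {
  bool fully_static = false;  // nothing left to refine
  bool has_rank = false;      // ranked (possibly with dynamic dimensions)
  bool has_subtypes = false;  // resource/variant element with known subtypes
  bool operator==(const TypeInfo& o) const {
    return fully_static == o.fully_static && has_rank == o.has_rank &&
           has_subtypes == o.has_subtypes;
  }
};

// Analogue of CanRefineTypeWith: skip no-ops and candidates that add nothing.
bool CanRefineTypeWith(const TypeInfo& original, const TypeInfo& candidate) {
  if (original == candidate || original.fully_static) return false;
  return candidate.has_rank || candidate.has_subtypes;
}

// Analogue of InferShapeForIf: a result is refined only if both branches agree
// on the type and that type refines the current result type.
int RefineIfResults(std::vector<TypeInfo>& results,
                    const std::vector<TypeInfo>& then_types,
                    const std::vector<TypeInfo>& else_types) {
  int changed = 0;
  for (std::size_t i = 0; i < results.size(); ++i) {
    if (!(then_types[i] == else_types[i])) continue;
    if (!CanRefineTypeWith(results[i], then_types[i])) continue;
    results[i] = then_types[i];
    ++changed;
  }
  return changed;
}

int main() {
  std::vector<TypeInfo> results{{}, {}};                       // both unranked
  std::vector<TypeInfo> then_types{{false, true, false}, {}};  // first is ranked
  std::vector<TypeInfo> else_types{{false, true, false}, {false, true, false}};
  std::cout << RefineIfResults(results, then_types, else_types) << "\n";  // 1
}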
+ LogicalResult PropagateShapeToRegions( + Operation::operand_type_range input_types, ArrayRef regions, + int64_t max_iteration); // Shape propagation for call/control flow ops. LogicalResult PropagateShapeIntoAttachedFunctions(Operation* op, int64_t max_iteration); + // Shape propagation for region based control flow. + LogicalResult PropagateShapeIntoAttachedRegions(Operation* op, + int64_t max_iterations); + // Propagates any constant operand of call_op to the called function body's // corresponding argument if the callee has only one use. // // TODO(b/154065712): Move this to a more general inter-procedural constant // folding pass. - void PropagateConstantToCallee(CallOpInterface call_op, - SymbolRefAttr callee_sym, ModuleOp module); + void PropagateConstantToCallee(CallOpInterface call_op, FuncOp func, + ModuleOp module); // Propagates any constant return value of the callee function to the call // op's corresponding result. - void PropagateConstantFromCallee(CallOpInterface call_op, - SymbolRefAttr callee_sym, ModuleOp module); + void PropagateConstantFromCallee(CallOpInterface call_op, FuncOp func, + ModuleOp module); // Tries to compute the result of folding the op. This doesn't actually // perform constant folding, it is just computes the equivalent constants. @@ -635,8 +684,8 @@ bool ShapeInference::RefineTypeForPassThroughOperands(Operation* op, .isa()) continue; - std::get<1>(entry).setType(operand_type); - AddCastBackForUnsupportedNonTFUses(op, result, tf_dialect_, result_type); + UpdateTypeAndInsertIncompatibleUseCasts(tf_dialect_, operand_type, op, + result); changed = true; } return changed; @@ -666,13 +715,12 @@ bool ShapeInference::RefineShapeForPassThroughOps(Operation* op) { result_type.getShape() == operand_type.getShape()) continue; if (!is_allowed_dtype(operand_type.getElementType()) || - !is_allowed_dtype(result_type.getElementType())) { + !is_allowed_dtype(result_type.getElementType())) continue; - } - result.setType(RankedTensorType::get(operand_type.getShape(), - result_type.getElementType())); - AddCastBackForUnsupportedNonTFUses(op, result, tf_dialect_, result_type); + auto new_type = RankedTensorType::get(operand_type.getShape(), + result_type.getElementType()); + UpdateTypeAndInsertIncompatibleUseCasts(tf_dialect_, new_type, op, result); changed = true; } return changed; @@ -712,7 +760,8 @@ bool ShapeInference::InferShapeForSingleOperation(Operation* op) { // The shape function of these ops sometimes does not propagate subtypes // (handle shapes) for resource and variant types. We use a simple passthrough // to make sure they are preserved in the output. - if (isa(op)) { + if (isa(op)) { return RefineTypeForPassThroughOperands(op, op->getOperands(), op->getResults()); } @@ -728,9 +777,7 @@ bool ShapeInference::InferShapeForSingleOperation(Operation* op) { // Handle call operations by looking up callee and infering return shape as // needed. - if (isa( - op)) - return InferShapeForCall(op); + if (auto call = dyn_cast(op)) return InferShapeForCall(call); // tf.Cast are only inferred if they have at least one user in the TF dialect // or feeding into the function return. This is necessary to avoid inserting @@ -738,6 +785,17 @@ bool ShapeInference::InferShapeForSingleOperation(Operation* op) { if (auto cast_op = dyn_cast(op)) return InferShapeForCast(cast_op, tf_dialect_); + // Handle IfOp here by inferring the shape from the else/then function + // results. 
Since `output_shapes` is a derived attribute, avoid going down the + // TF InferenceContext path as IfOp shape inference is implemented as just + // a lookup of the output_shapes attribute. + if (auto if_op = dyn_cast(op)) return InferShapeForIf(if_op); + + // Handle IfRegion operations by infering return shape from the then and else + // branches. + if (auto if_region = dyn_cast(op)) + return InferShapeForIfRegion(if_region); + StringRef op_name = op->getName().getStringRef(); // Drop the `tf.` prefix to query TF registry. auto node_name = @@ -910,12 +968,8 @@ bool ShapeInference::InferShapeForSingleOperation(Operation* op) { } auto new_type = get_tensor_type(shape_handle, new_element_type); if (result.getType() == new_type) continue; - // Inserts a cast back to the original type if any user is not in the TF - // dialect or a return. - AddCastBackForUnsupportedNonTFUses(op, result, tf_dialect_, - result.getType()); - // Finally we inferred the shape and replace the type for this result. - result.setType(new_type); + + UpdateTypeAndInsertIncompatibleUseCasts(tf_dialect_, new_type, op, result); changed = true; } if (changed) @@ -924,59 +978,72 @@ bool ShapeInference::InferShapeForSingleOperation(Operation* op) { return changed; } -LogicalResult ShapeInference::RefineShapeForControlFlowFunc( - FuncOp func, ArrayRef input_types, int64_t max_iteration) { - ModuleOp module = func.getParentOfType(); - auto func_uses = SymbolTable::getSymbolUses(func, &module.getBodyRegion()); - int num_uses = std::distance(func_uses->begin(), func_uses->end()); - if (num_uses != 1) { - func.emitWarning(formatv( - "expected control flow function {0} to have exactly 1 use, found {1}.", - func.getName(), num_uses)); - return failure(); - } - - FunctionType func_type = func.getType(); - func.setType(FunctionType::get(input_types, func_type.getResults(), - func.getContext())); - - for (auto arg_and_idx : llvm::enumerate(func.getArguments())) { - arg_and_idx.value().setType(input_types[arg_and_idx.index()]); - } - - auto res = InferShapeUntilFixPoint(&func.getBody(), max_iteration); - if (failed(res)) return res; - - auto new_return_types = InferShapeForFunctionReturnType(func); - if (new_return_types.hasValue()) { - func.setType(FunctionType::get(input_types, new_return_types.getValue(), - func.getContext())); - } - - return success(); -} - LogicalResult ShapeInference::PropagateShapeToFunctions( ModuleOp module, Operation::operand_type_range input_types, - ArrayRef func_names, int64_t max_iteration) { + ArrayRef functions, int64_t max_iteration) { bool all_succeeded = true; auto types = llvm::to_vector<4>(input_types); - for (auto func_name : func_names) { - FuncOp func = module.lookupSymbol(func_name); - all_succeeded = - succeeded(RefineShapeForControlFlowFunc(func, types, max_iteration)) && - all_succeeded; + // If shape propagation fails for one function, return failure, but do not + // early exit and attempt to propagate shapes for all provided functions to + // have a best-effort propagation. 
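// Aside: PropagateShapeToFunctions above deliberately keeps going after a
// failure so every callee gets a chance to be refined, and only reports failure
// at the end. A compact standalone sketch of that best-effort accumulation with
// a hypothetical refine callback; the callback and function names are
// illustrative only.
#include <functional>
#include <iostream>
#include <string>
#include <vector>

// Attempts every function, remembers whether any attempt failed, never exits
// early (mirrors the `all_succeeded = ... && all_succeeded` pattern above).
bool PropagateToAll(const std::vector<std::string>& functions,
                    const std::function<bool(const std::string&)>& refine) {
  bool all_succeeded = true;
  for (const std::string& name : functions)
    all_succeeded = refine(name) && all_succeeded;
  return all_succeeded;
}

int main() {
  auto refine = [](const std::string& name) {
    bool ok = name != "multi_use_fn";  // e.g. skip functions with >1 use
    std::cout << (ok ? "refined " : "skipped ") << name << "\n";
    return ok;
  };
  bool ok = PropagateToAll({"then_fn", "multi_use_fn", "else_fn"}, refine);
  std::cout << std::boolalpha << ok << "\n";  // false, but all were attempted
}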
+ for (FuncOp func : functions) { + auto func_uses = SymbolTable::getSymbolUses(func, &module.getBodyRegion()); + if (!llvm::hasSingleElement(func_uses.getValue())) { + int num_uses = std::distance(func_uses->begin(), func_uses->end()); + func.emitWarning( + formatv("expected control flow function @{0} to have exactly 1 use, " + "found {1}.", + func.getName(), num_uses)); + all_succeeded = false; + continue; + } + + FunctionType func_type = func.getType(); + func.setType( + FunctionType::get(types, func_type.getResults(), func.getContext())); + + auto res = + PropagateShapeToRegions(input_types, {&func.getBody()}, max_iteration); + if (failed(res)) { + all_succeeded = false; + continue; + } + + auto new_return_types = InferShapeForFunctionReturnType(func); + if (new_return_types) + func.setType(FunctionType::get(types, new_return_types.getValue(), + func.getContext())); + } + return success(all_succeeded); +} + +LogicalResult ShapeInference::PropagateShapeToRegions( + Operation::operand_type_range input_types, ArrayRef regions, + int64_t max_iteration) { + bool all_succeeded = true; + auto types = llvm::to_vector<4>(input_types); + // If shape propagation fails for one region, return failure, but do not + // early exit and attempt to propagate shapes for all provided regions to + // have a best-effort propagation. + for (auto region : regions) { + // Refine region arguments. + Block& entry = region->front(); + assert(types.size() == entry.getNumArguments()); + for (auto arg_and_idx : llvm::enumerate(entry.getArguments())) { + arg_and_idx.value().setType(types[arg_and_idx.index()]); + } + + // Propagate shapes into the region. + all_succeeded = succeeded(InferShapeUntilFixPoint(region, max_iteration)) && + all_succeeded; } return success(all_succeeded); } void ShapeInference::PropagateConstantToCallee(CallOpInterface call_op, - SymbolRefAttr callee_sym, - ModuleOp module) { - auto func = module.lookupSymbol(callee_sym.getRootReference()); + FuncOp func, ModuleOp module) { auto func_uses = SymbolTable::getSymbolUses(func, &module.getBodyRegion()); - int num_uses = std::distance(func_uses->begin(), func_uses->end()); - if (num_uses != 1) return; + if (!llvm::hasSingleElement(func_uses.getValue())) return; OpBuilder builder(&func.front().front()); Operation* op = call_op.getOperation(); @@ -1002,9 +1069,7 @@ void ShapeInference::PropagateConstantToCallee(CallOpInterface call_op, } void ShapeInference::PropagateConstantFromCallee(CallOpInterface call_op, - SymbolRefAttr callee_sym, - ModuleOp module) { - auto func = module.lookupSymbol(callee_sym.getRootReference()); + FuncOp func, ModuleOp module) { // If the return value is a constant, use the constant as the value of // the call return. 
Operation* op = call_op.getOperation(); @@ -1036,28 +1101,29 @@ LogicalResult ShapeInference::PropagateShapeIntoAttachedFunctions( if (auto if_op = dyn_cast(op)) { return PropagateShapeToFunctions( module, drop_begin(if_op.getOperandTypes(), 1), - {if_op.then_branch(), if_op.else_branch()}, max_iteration); + {if_op.then_func(), if_op.else_func()}, max_iteration); } else if (auto case_op = dyn_cast(op)) { - SmallVector branches; - for (Attribute branch : case_op.branches()) - branches.push_back(branch.cast().getValue()); + SmallVector branches; + for (Attribute branch : case_op.branches()) { + auto sym = branch.cast(); + branches.push_back(SymbolTable::lookupNearestSymbolFrom(op, sym)); + } return PropagateShapeToFunctions(module, drop_begin(case_op.getOperandTypes(), 1), branches, max_iteration); } else if (auto while_op = dyn_cast(op)) { - return PropagateShapeToFunctions(module, while_op.getOperandTypes(), - {while_op.cond(), while_op.body()}, - max_iteration); + return PropagateShapeToFunctions( + module, while_op.getOperandTypes(), + {while_op.cond_func(), while_op.body_func()}, max_iteration); } else if (auto call_op = dyn_cast(op)) { - CallInterfaceCallable callable = call_op.getCallableForCallee(); - if (SymbolRefAttr sym = callable.dyn_cast()) { - PropagateConstantToCallee(call_op, sym, module); - if (failed(PropagateShapeToFunctions( - module, call_op.getArgOperands().getTypes(), - {sym.getRootReference()}, max_iteration))) { + if (auto func = dyn_cast(call_op.resolveCallable())) { + PropagateConstantToCallee(call_op, func, module); + if (failed(PropagateShapeToFunctions(module, + call_op.getArgOperands().getTypes(), + {func}, max_iteration))) { return failure(); } - PropagateConstantFromCallee(call_op, sym, module); + PropagateConstantFromCallee(call_op, func, module); return success(); } } @@ -1067,6 +1133,16 @@ LogicalResult ShapeInference::PropagateShapeIntoAttachedFunctions( return success(); } +LogicalResult ShapeInference::PropagateShapeIntoAttachedRegions( + Operation* op, int64_t max_iteration) { + if (auto while_op = dyn_cast(op)) { + return PropagateShapeToRegions(while_op.getOperandTypes(), + {&while_op.cond(), &while_op.body()}, + max_iteration); + } + return success(); +} + LogicalResult ShapeInference::TryToFold(Operation* op) { LLVM_DEBUG(op->print(llvm::dbgs() << "TryToFold "); llvm::dbgs() << "\n"); // If any output result is known, then the op probably has been computed @@ -1118,12 +1194,8 @@ LogicalResult ShapeInference::TryToFold(Operation* op) { if (ElementsAttr eattr = attr.dyn_cast_or_null()) { if (std::get<0>(result).getType() == eattr.getType()) continue; - // Inserts a cast back to the original type if any user is not in the - // TF dialect. 
- Type old_type = std::get<0>(result).getType(); - std::get<0>(result).setType(eattr.getType()); - AddCastBackForUnsupportedNonTFUses(op, std::get<0>(result), tf_dialect_, - old_type); + UpdateTypeAndInsertIncompatibleUseCasts(tf_dialect_, eattr.getType(), op, + std::get<0>(result)); } } @@ -1164,6 +1236,11 @@ LogicalResult ShapeInference::InferShapeUntilFixPoint(Region* region, "arguments and bodies"; } + if (failed(PropagateShapeIntoAttachedRegions(op, max_iteration))) { + op->emitWarning() << "unable to refine shape of attached region " + "arguments and bodies"; + } + changed |= InferShapeForSingleOperation(op); }); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/stack_ops_decomposition.cc b/tensorflow/compiler/mlir/tensorflow/transforms/stack_ops_decomposition.cc index 5e095a311ee..d3755a4a7d0 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/stack_ops_decomposition.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/stack_ops_decomposition.cc @@ -163,7 +163,7 @@ LogicalResult HandleWhileOp( const llvm::SmallDenseMap& data_var_to_size_var, llvm::StringMap* decomposed_partitioned_call_callees) { - auto body = module.lookupSymbol(while_op.body()); + auto body = while_op.body_func(); llvm::SmallDenseMap body_map; auto find_arg_stack_type = [&](int64_t index) -> llvm::Optional { auto it = data_var_to_size_var.find(while_op.getOperand(index)); @@ -187,7 +187,7 @@ LogicalResult HandleWhileOp( return failure(); } // Cond should not change stacks in the arguments, so use an empty map. - auto cond = module.lookupSymbol(while_op.cond()); + auto cond = while_op.cond_func(); ModifyFunctionSignature(cond, nullptr, find_arg_stack_type); llvm::SmallDenseMap empty_map; if (failed(DecomposeStackOpsInternal(&cond.front(), module, &empty_map, @@ -197,24 +197,16 @@ LogicalResult HandleWhileOp( if (!signature_change) return success(); // Create the new while op. auto new_while_operands = llvm::to_vector<8>(while_op.getOperands()); - auto new_output_shapes = - llvm::to_vector<8>(while_op.output_shapes().getValue()); OpBuilder builder(while_op); assert(while_op.getNumOperands() == while_op.getNumResults()); for (int64_t i = 0; i < while_op.getNumResults(); ++i) { auto it = data_var_to_size_var.find(while_op.getOperand(i)); if (it == data_var_to_size_var.end()) continue; new_while_operands.push_back(it->getSecond()); - if (!new_output_shapes.empty()) { - // Size is a scalar shape. 
- new_output_shapes.push_back( - mlir::TF::ShapeAttr::get(builder.getContext(), ArrayRef())); - } } auto new_while = builder.create(while_op.getLoc(), body.getType().getInputs(), new_while_operands, while_op.getAttrs()); - new_while.setAttr("output_shapes", builder.getArrayAttr(new_output_shapes)); for (int64_t i = 0; i < while_op.getNumResults(); ++i) { if (!getElementTypeOrSelf(while_op.getOperand(i).getType()) .isa()) { @@ -239,8 +231,8 @@ LogicalResult HandleIfOp( const llvm::SmallDenseMap& data_var_to_size_var, llvm::StringMap* decomposed_partitioned_call_callees) { - auto then_branch = module.lookupSymbol(if_op.then_branch()); - auto else_branch = module.lookupSymbol(if_op.else_branch()); + auto then_func = if_op.then_func(); + auto else_func = if_op.else_func(); llvm::SmallDenseMap then_map; llvm::SmallDenseMap else_map; @@ -249,12 +241,12 @@ LogicalResult HandleIfOp( if (it == data_var_to_size_var.end()) return llvm::None; return it->getFirst().getType(); }; - ModifyFunctionSignature(then_branch, &then_map, find_arg_stack_type); - ModifyFunctionSignature(else_branch, &else_map, find_arg_stack_type); + ModifyFunctionSignature(then_func, &then_map, find_arg_stack_type); + ModifyFunctionSignature(else_func, &else_map, find_arg_stack_type); const bool signature_change = !then_map.empty() || !else_map.empty(); - if (failed(DecomposeStackOpsInternal(&then_branch.front(), module, &then_map, + if (failed(DecomposeStackOpsInternal(&then_func.front(), module, &then_map, decomposed_partitioned_call_callees)) || - failed(DecomposeStackOpsInternal(&else_branch.front(), module, &else_map, + failed(DecomposeStackOpsInternal(&else_func.front(), module, &else_map, decomposed_partitioned_call_callees))) { return failure(); } @@ -266,16 +258,16 @@ LogicalResult HandleIfOp( new_if_operands.push_back(it->getSecond()); } auto new_if = OpBuilder(if_op).create( - if_op.getLoc(), then_branch.getType().getResults(), new_if_operands, + if_op.getLoc(), then_func.getType().getResults(), new_if_operands, if_op.getAttrs()); for (auto result : if_op.getResults()) { if (!getElementTypeOrSelf(result.getType()).isa()) { continue; } int64_t then_aliased_input = - FindAliasedInput(then_branch, result.getResultNumber()); + FindAliasedInput(then_func, result.getResultNumber()); int64_t else_aliased_input = - FindAliasedInput(else_branch, result.getResultNumber()); + FindAliasedInput(else_func, result.getResultNumber()); if (then_aliased_input >= 0 && then_aliased_input == else_aliased_input) { // Replace aliased stack output uses with input. result.replaceAllUsesWith(if_op.getOperand(then_aliased_input + 1)); @@ -409,11 +401,9 @@ LogicalResult HandleStackV2Op( ArrayRef{buffer.getType().cast()}, stack.getContext())); auto local_var = builder.create( - stack.getLoc(), ArrayRef{var_type}, ArrayRef{}, - ArrayRef{}); + stack.getLoc(), ArrayRef{var_type}, ArrayRef{}); auto local_size_var = builder.create( - stack.getLoc(), ArrayRef{size_var_type}, ArrayRef{}, - ArrayRef{}); + stack.getLoc(), ArrayRef{size_var_type}, ArrayRef{}); // Zero-initialize the local vars. 
cutil::WriteLocalVariable(local_size_var, cutil::GetR1Const({0LL}, builder, stack.getLoc()), @@ -446,8 +436,7 @@ LogicalResult HandleStackPushV2Op( cutil::WriteLocalVariable(push.handle(), stack_val, builder, push.getLoc()); index = builder.create( push.getLoc(), ArrayRef{index.getType()}, - ArrayRef{index, cutil::GetR1Const({1}, builder, push.getLoc())}, - ArrayRef{}); + ArrayRef{index, cutil::GetR1Const({1}, builder, push.getLoc())}); cutil::WriteLocalVariable(it->getSecond(), index, builder, push.getLoc()); push.erase(); return success(); @@ -467,8 +456,7 @@ LogicalResult HandleStackPopV2Op( auto size = cutil::ReadLocalVariable(it->getSecond(), builder, pop.getLoc()); auto new_size = builder.create( pop.getLoc(), ArrayRef{size.getType()}, - ArrayRef{size, cutil::GetR1Const({1}, builder, pop.getLoc())}, - ArrayRef{}); + ArrayRef{size, cutil::GetR1Const({1}, builder, pop.getLoc())}); auto pop_val = cutil::GetElement(new_size, stack_val, builder, pop.getLoc()); pop.replaceAllUsesWith(pop_val); // Update the size. @@ -519,21 +507,20 @@ LogicalResult DecomposeStackOpsInternal( return failure(); } } else if (auto pcall = llvm::dyn_cast(&op)) { - if (!pcall.f().isa()) { + if (!pcall.func()) { return pcall.emitOpError( "stack decomposition does not support call with nested references"); } if (failed(HandlePartitionedCallOp( - pcall, module.lookupSymbol(pcall.f().getRootReference()), - module, *data_var_to_size_var, + pcall, pcall.func(), module, *data_var_to_size_var, decomposed_partitioned_call_callees))) { return failure(); } } else if (auto spcall = llvm::dyn_cast(&op)) { if (failed(HandlePartitionedCallOp( - spcall, module.lookupSymbol(spcall.f()), module, - *data_var_to_size_var, decomposed_partitioned_call_callees))) { + spcall, spcall.func(), module, *data_var_to_size_var, + decomposed_partitioned_call_callees))) { return failure(); } } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_array_ops_decomposition.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_array_ops_decomposition.cc index 9c659a95078..b3a05c06a67 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_array_ops_decomposition.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_array_ops_decomposition.cc @@ -166,8 +166,7 @@ LogicalResult HandleTensorArrayV3Op( ArrayRef{buffer.getType().cast()}, ta.getContext())); auto local_var = builder.create( - ta.getLoc(), ArrayRef{var_type}, ArrayRef{}, - ArrayRef{}); + ta.getLoc(), ArrayRef{var_type}, ArrayRef{}); cutil::WriteLocalVariable(local_var, buffer, builder, ta.getLoc()); ta.handle().replaceAllUsesWith(local_var); // The flow output is just a way for the front end to enforce ordering among @@ -227,8 +226,7 @@ LogicalResult HandleTensorArrayWriteV3Op( elem = builder.create( write.getLoc(), ArrayRef{slice_type}, ArrayRef{elem, cutil::GetR1Const(slice_type.getShape(), builder, - write.getLoc())}, - ArrayRef{}); + write.getLoc())}); elem = cutil::AccumulateBuffers(elem, original_elem, builder, write.getLoc()); } @@ -261,8 +259,7 @@ LogicalResult HandleTensorArrayConcatV3Op( ArrayRef{ RankedTensorType::get(shape, buffer_type.getElementType())}, ArrayRef{buffer, - cutil::GetR1Const(shape, builder, concat.getLoc())}, - ArrayRef{}); + cutil::GetR1Const(shape, builder, concat.getLoc())}); concat.value().replaceAllUsesWith(buffer); // Create the lengths as a list of the same value (element size). 
@@ -302,8 +299,7 @@ LogicalResult HandleTensorArraySplitV3Op( buffer_shape, elem_type.getElementType())}, ArrayRef{split.value(), cutil::GetR1Const(buffer_shape, builder, - split.getLoc())}, - ArrayRef{}) + split.getLoc())}) .output(); // Accumulate with the old buffer. auto old_buffer = @@ -339,8 +335,7 @@ LogicalResult CreateAndInitializeGradVariable(Type local_var_type, Operation* op, Value* var) { OpBuilder builder(op); *var = builder.create( - op->getLoc(), ArrayRef{local_var_type}, ArrayRef{}, - ArrayRef{}); + op->getLoc(), ArrayRef{local_var_type}, ArrayRef{}); Value buffer; auto buffer_type = getElementTypeOrSelf(local_var_type) .cast() @@ -447,38 +442,20 @@ llvm::SmallDenseMap> AccessedGradients( if (auto grad = llvm::dyn_cast(&op)) { insert(grad.handle(), grad.source().str()); } else if (auto while_op = llvm::dyn_cast(&op)) { - auto body = module.lookupSymbol(while_op.body()); - auto cond = module.lookupSymbol(while_op.cond()); - for (const auto& entry : AccessedGradients({body, cond}, module)) { - for (const string& source : entry.getSecond()) { + for (const auto& entry : AccessedGradients( + {while_op.body_func(), while_op.cond_func()}, module)) + for (const string& source : entry.getSecond()) insert(while_op.getOperand(entry.getFirst()), source); - } - } } else if (auto if_op = llvm::dyn_cast(&op)) { - auto then_branch = module.lookupSymbol(if_op.then_branch()); - auto else_branch = module.lookupSymbol(if_op.else_branch()); for (const auto& entry : - AccessedGradients({then_branch, else_branch}, module)) { - for (const string& source : entry.getSecond()) { + AccessedGradients({if_op.then_func(), if_op.else_func()}, module)) + for (const string& source : entry.getSecond()) insert(if_op.getOperand(entry.getFirst() + 1), source); - } - } - } else if (auto pc = llvm::dyn_cast(&op)) { - if (!pc.f().isa()) continue; - auto callee = module.lookupSymbol(pc.f().getRootReference()); - for (const auto& entry : AccessedGradients({callee}, module)) { - for (const string& source : entry.getSecond()) { - insert(pc.getOperand(entry.getFirst()), source); - } - } - } else if (auto spc = - llvm::dyn_cast(&op)) { - auto callee = module.lookupSymbol(spc.f()); - for (const auto& entry : AccessedGradients({callee}, module)) { - for (const string& source : entry.getSecond()) { - insert(spc.getOperand(entry.getFirst()), source); - } - } + } else if (auto call = llvm::dyn_cast(&op)) { + auto callee = dyn_cast(call.resolveCallable()); + for (const auto& entry : AccessedGradients({callee}, module)) + for (const string& source : entry.getSecond()) + insert(call.getArgOperands()[entry.getFirst()], source); } } } @@ -532,8 +509,8 @@ LogicalResult HandleWhileOp(TF::WhileOp while_op, ModuleOp module, llvm::SmallDenseMap* stats, llvm::StringMap* decomposed_partitioned_call_callees) { - auto body = module.lookupSymbol(while_op.body()); - auto cond = module.lookupSymbol(while_op.cond()); + auto body = while_op.body_func(); + auto cond = while_op.cond_func(); auto grads = AccessedGradients({body, cond}, module); auto ta_arg_buffer_type = [&](int64_t index) -> Type { auto it = stats->find(while_op.getOperand(index)); @@ -600,8 +577,6 @@ LogicalResult HandleWhileOp(TF::WhileOp while_op, ModuleOp module, auto new_while = builder.create(while_op.getLoc(), body.getType().getInputs(), operands, while_op.getAttrs()); - // Clear the output shapes as it is not needed for XLA lowering. 
- new_while.setAttr("output_shapes", builder.getArrayAttr({})); for (int64_t i = 0; i < while_op.getNumOperands(); ++i) { if (ta_arg_buffer_type(i)) { while_op.getResult(i).replaceAllUsesWith(while_op.getOperand(i)); @@ -617,8 +592,8 @@ LogicalResult HandleIfOp(TF::IfOp if_op, ModuleOp module, llvm::SmallDenseMap* stats, llvm::StringMap* decomposed_partitioned_call_callees) { - auto then_branch = module.lookupSymbol(if_op.then_branch()); - auto else_branch = module.lookupSymbol(if_op.else_branch()); + auto then_branch = if_op.then_func(); + auto else_branch = if_op.else_func(); auto grads = AccessedGradients({then_branch, else_branch}, module); auto ta_arg_buffer_type = [&](int64_t index) -> Type { auto it = stats->find(if_op.getOperand(index + 1)); @@ -668,8 +643,6 @@ LogicalResult HandleIfOp(TF::IfOp if_op, ModuleOp module, auto new_if = builder.create(if_op.getLoc(), then_branch.getType().getResults(), operands, if_op.getAttrs()); - // Clear the output shapes as it is not needed for XLA lowering. - new_if.setAttr("output_shapes", builder.getArrayAttr({})); auto ret_forwards_input = [](FuncOp f, int64_t ret_ind) -> int64_t { auto retval = f.front().getTerminator()->getOperand(ret_ind); auto arg = retval.dyn_cast(); @@ -847,21 +820,22 @@ LogicalResult DecomposeTensorArrayOps( return failure(); } } else if (auto pcall = llvm::dyn_cast(&op)) { - if (!pcall.f().isa()) { + auto callee = pcall.func(); + if (!callee) return pcall.emitOpError( "TensorArray decomposition does not support call with nested " "references."); - } - if (failed(HandlePartitionedCallOp( - pcall, module.lookupSymbol(pcall.f().getRootReference()), - module, stats, decomposed_partitioned_call_callees))) { + + if (failed( + HandlePartitionedCallOp(pcall, callee, module, stats, + decomposed_partitioned_call_callees))) { return failure(); } } else if (auto spcall = llvm::dyn_cast(&op)) { - if (failed(HandlePartitionedCallOp( - spcall, module.lookupSymbol(spcall.f()), module, stats, - decomposed_partitioned_call_callees))) { + if (failed( + HandlePartitionedCallOp(spcall, spcall.func(), module, stats, + decomposed_partitioned_call_callees))) { return failure(); } } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc index 11153f0dfc3..9634e4a8be3 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc @@ -155,7 +155,7 @@ LogicalResult HandleWhileOp( llvm::StringMap* decomposed_partitioned_call_callees) { // Rewrite body. - auto body = module.lookupSymbol(while_op.body()); + auto body = while_op.body_func(); llvm::SmallDenseMap body_map; auto find_arg_tensor_list_type = [&](int64_t index) -> llvm::Optional { auto it = buffer_to_size->find(while_op.getOperand(index)); @@ -176,7 +176,7 @@ LogicalResult HandleWhileOp( auto output_buffer_to_size = AddTensorListSizesToReturn(body, body_map); // Rewrite cond. - auto cond = module.lookupSymbol(while_op.cond()); + auto cond = while_op.cond_func(); llvm::SmallDenseMap cond_map; ModifyFunctionSignature(cond, cutil::GetSizeType(builder), &cond_map, find_arg_tensor_list_type, arg_buffer_size_is_fixed); @@ -190,22 +190,14 @@ LogicalResult HandleWhileOp( } // Create the new while op. 
auto new_while_operands = llvm::to_vector<8>(while_op.getOperands()); - auto new_output_shapes = - llvm::to_vector<8>(while_op.output_shapes().getValue()); for (int64_t i = 0; i < while_op.getNumResults(); ++i) { auto it = buffer_to_size->find(while_op.getOperand(i)); if (it == buffer_to_size->end()) continue; new_while_operands.push_back(it->getSecond().size); - if (!new_output_shapes.empty()) { - // Size is a scalar shape. - new_output_shapes.push_back( - mlir::TF::ShapeAttr::get(builder.getContext(), ArrayRef())); - } } auto new_while = builder.create(while_op.getLoc(), body.getType().getInputs(), new_while_operands, while_op.getAttrs()); - new_while.setAttr("output_shapes", builder.getArrayAttr(new_output_shapes)); for (const auto& entry : output_buffer_to_size) { (*buffer_to_size)[new_while.getResult(std::get<0>(entry))] = { new_while.getResult(std::get<1>(entry)), std::get<2>(entry)}; @@ -438,7 +430,7 @@ LogicalResult HandleTensorListFromTensorOp( OpBuilder builder(list); Value buffer = builder.create( list.getLoc(), ArrayRef{list.tensor().getType()}, - ArrayRef{list.tensor()}, ArrayRef{}); + ArrayRef{list.tensor()}); auto type = buffer.getType().cast(); if (!type.hasStaticShape()) { return list.emitOpError("TensorListFromTensorOp input has unknown shape."); @@ -468,8 +460,7 @@ LogicalResult HandleTensorListPushBackOp( cutil::SetElement(size, buffer, push.tensor(), builder, push.getLoc()); auto new_size = builder.create( push.getLoc(), ArrayRef{size.getType()}, - ArrayRef{size, cutil::GetR1Const({1LL}, builder, push.getLoc())}, - ArrayRef{}); + ArrayRef{size, cutil::GetR1Const({1LL}, builder, push.getLoc())}); push.output_handle().replaceAllUsesWith(new_buffer); (*buffer_to_size)[new_buffer] = {new_size, /*fixed=*/false}; push.erase(); @@ -491,12 +482,10 @@ LogicalResult HandleTensorListPopBackOp( auto size = it->getSecond().size; OpBuilder builder(pop); auto new_buffer = builder.create( - pop.getLoc(), ArrayRef{buffer.getType()}, ArrayRef{buffer}, - ArrayRef{}); + pop.getLoc(), ArrayRef{buffer.getType()}, ArrayRef{buffer}); auto new_size = builder.create( pop.getLoc(), ArrayRef{size.getType()}, - ArrayRef{size, cutil::GetR1Const({1LL}, builder, pop.getLoc())}, - ArrayRef{}); + ArrayRef{size, cutil::GetR1Const({1LL}, builder, pop.getLoc())}); auto element = cutil::GetElement(new_size, new_buffer, builder, pop.getLoc()); pop.output_handle().replaceAllUsesWith(new_buffer); pop.tensor().replaceAllUsesWith(element); @@ -567,8 +556,7 @@ LogicalResult HandleTensorListLengthOp( ArrayRef{RankedTensorType::get( {}, getElementTypeOrSelf(current_size.getType()))}, ArrayRef{current_size, - cutil::GetR1Const({}, builder, length.getLoc())}, - ArrayRef{}); + cutil::GetR1Const({}, builder, length.getLoc())}); length.length().replaceAllUsesWith(reshape); } length.erase(); @@ -713,11 +701,8 @@ LogicalResult DecomposeTensorListOpsInternal( return failure(); } } else if (auto if_op = llvm::dyn_cast(&op)) { - auto then_branch = module.lookupSymbol(if_op.then_branch()); - auto else_branch = module.lookupSymbol(if_op.else_branch()); - - if (failed(HandleCaseOrIfOp(if_op, {then_branch, else_branch}, module, - buffer_to_size, + if (failed(HandleCaseOrIfOp(if_op, {if_op.then_func(), if_op.else_func()}, + module, buffer_to_size, decomposed_partitioned_call_callees))) { return failure(); } @@ -732,21 +717,21 @@ LogicalResult DecomposeTensorListOpsInternal( return failure(); } } else if (auto pcall = llvm::dyn_cast(&op)) { - if (!pcall.f().isa()) { + if (!pcall.func()) return pcall.emitOpError( "TensorList 
decomposition does not support call with nested " "references."); - } + if (failed(HandlePartitionedCallOp( - pcall, module.lookupSymbol(pcall.f().getRootReference()), - module, buffer_to_size, decomposed_partitioned_call_callees))) { + pcall, pcall.func(), module, buffer_to_size, + decomposed_partitioned_call_callees))) { return failure(); } } else if (auto spcall = llvm::dyn_cast(&op)) { if (failed(HandlePartitionedCallOp( - spcall, module.lookupSymbol(spcall.f()), module, - buffer_to_size, decomposed_partitioned_call_callees))) { + spcall, spcall.func(), module, buffer_to_size, + decomposed_partitioned_call_callees))) { return failure(); } } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/test_resource_alias_analysis.cc b/tensorflow/compiler/mlir/tensorflow/transforms/test_resource_alias_analysis.cc new file mode 100644 index 00000000000..920b2024c0f --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/test_resource_alias_analysis.cc @@ -0,0 +1,111 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include +#include +#include + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Debug.h" +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Transforms/Passes.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir { +namespace TF { +namespace { + +// A pass that annotates each operation with a resource type result with the +// aliasing values for each such result. Each value is assigned a unique ID, and +// that ID is used to annotate the operations. +struct TestResourceAliasAnalysis + : public TF::PerFunctionAggregateAnalysisConsumerPass< + TestResourceAliasAnalysis, TF::ResourceAliasAnalysis> { + void runOnFunction(FuncOp func, + const TF::ResourceAliasAnalysis::Info& analysis) { + int64_t next_id = 0; + llvm::SmallDenseMap ids; + + auto assign_id = [&](Value value) { + if (ids.find(value) == ids.end()) ids.insert({value, next_id++}); + }; + + auto get_id = [&](Value value) -> int64_t { + auto it = ids.find(value); + assert(it != ids.end()); + return it->second; + }; + + auto print_aliases = [&](InFlightDiagnostic& diag, Value value) { + diag << ", ID " << get_id(value) << " : "; + if (analysis.IsUnknownResource(value)) { + diag << "Unknown"; + } else { + auto aliases = llvm::to_vector<4>(analysis.GetResourceAliases(value)); + llvm::sort(aliases, + [&](Value v1, Value v2) { return get_id(v1) < get_id(v2); }); + llvm::interleaveComma(aliases, diag, + [&](Value v) { diag << get_id(v); }); + } + }; + + // Assign a unique ID to each value seen in this function. 
+ func.walk([&](Operation* op) {
+ // For all attached regions, assign ID to the region arguments.
+ for (Region& region : op->getRegions()) {
+ for (auto region_arg : filter_resources(region.getArguments()))
+ assign_id(region_arg);
+ }
+
+ // Assign ID for all results.
+ for (auto result : filter_resources(op->getResults())) assign_id(result);
+ });
+
+ // Now walk each operation, and annotate it with remarks for aliases for
+ // each resource type result.
+ func.walk([&](Operation* op) {
+ // For all attached regions, assign ID to the region arguments.
+ for (Region& region : op->getRegions()) {
+ for (auto region_arg : filter_resources(region.getArguments())) {
+ InFlightDiagnostic diag = op->emitRemark("Region #")
+ << region.getRegionNumber() << ", Arg #"
+ << region_arg.getArgNumber();
+ print_aliases(diag, region_arg);
+ }
+ }
+
+ for (auto result : filter_resources(op->getResults())) {
+ InFlightDiagnostic diag = op->emitRemark("Result #")
+ << result.getResultNumber();
+ print_aliases(diag, result);
+ }
+ });
+ }
+};
+
+static mlir::PassRegistration pass(
+ "tf-test-resource-alias-analysis",
+ "Add remarks based on resource alias analysis result, for testing "
+ "purpose.");
+
+} // anonymous namespace
+} // namespace TF
+} // namespace mlir
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/test_side_effect_analysis.cc b/tensorflow/compiler/mlir/tensorflow/transforms/test_side_effect_analysis.cc
index 6b284222526..405c529840b 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/test_side_effect_analysis.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/test_side_effect_analysis.cc
@@ -39,11 +39,13 @@ namespace {
// A pass that adds "Predecessors" and "Successors" remarks for each op based on
// SideEffectAnalysis result. For testing purpose only.
struct TestSideEffectAnalysis
- : public mlir::PassWrapper {
- void runOnFunction() override {
+ : public TF::PerFunctionAggregateAnalysisConsumerPass<
+ TestSideEffectAnalysis, TF::SideEffectAnalysis> {
+ void runOnFunction(FuncOp func,
+ const TF::SideEffectAnalysis::Info& analysis) {
int64_t next_id = 0;
llvm::SmallDenseMap ids;
- getFunction().walk([&](Operation* op) {
+ func.walk([&](Operation* op) {
ids[op] = next_id++;
op->emitRemark("ID: ") << ids[op];
});
@@ -53,8 +55,7 @@ struct TestSideEffectAnalysis
for (auto op : ops) id_vec.push_back(std::to_string(ids[op]));
return llvm::join(id_vec, ",");
};
- auto& analysis = getAnalysis();
- getFunction().walk([&](Operation* op) {
+ func.walk([&](Operation* op) {
if (!analysis.DirectControlPredecessors(op).empty()) {
op->emitRemark("Predecessors: ")
<< "{" << join_ids(analysis.DirectControlPredecessors(op)) << "}";
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_passes.h b/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_passes.h
index f7a73dc1561..d46b81156f9 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_passes.h
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_passes.h
@@ -46,6 +46,9 @@ CreateRemoveVariablesInSessionInitializerPass();
std::unique_ptr> CreateLiftVariablesPass(
::tensorflow::Session* session);
+// Creates a pass that removes duplicate 'tf_saved_model.bound_input' bindings.
+std::unique_ptr> CreateDedupBoundInputBindingPass(); + } // namespace tf_saved_model } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_cluster_formation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_cluster_formation.cc index 9abf67b62a9..162ecd77d4f 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_cluster_formation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_cluster_formation.cc @@ -344,8 +344,9 @@ LogicalResult ReplicateCluster(tf_device::ClusterOp cluster, int num_replicas) { for (auto& pos_and_input : llvm::enumerate(replicated_input_ops)) { auto input = pos_and_input.value(); bool is_packed = llvm::cast(input).is_packed(); + const int num_operands = input->getNumOperands(); int num_inputs = is_packed ? 1 : num_replicas; - if (input->getNumOperands() != num_inputs) + if (num_operands != num_inputs) return input->emitOpError() << "requires " << num_inputs << " operands"; auto tpu_replicated_input = llvm::cast(input); @@ -393,7 +394,8 @@ LogicalResult ReplicateCluster(tf_device::ClusterOp cluster, int num_replicas) { << "requires output of " << cluster.getOperationName() << " to lead to a 'tf.TPUReplicatedOutput' op"; - if (def->getNumResults() != num_replicas) + const int def_NumResults = def->getNumResults(); + if (def_NumResults != num_replicas) return def->emitOpError() << "requires " << num_replicas << " results"; auto replicate_outputs = llvm::make_range( diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_dynamic_layout_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_dynamic_layout_pass.cc index e2f4fca1219..41362465cd9 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_dynamic_layout_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_dynamic_layout_pass.cc @@ -36,6 +36,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.h" #include "tensorflow/core/protobuf/tpu/compile_metadata.pb.h" @@ -77,24 +78,28 @@ constexpr char kFuncDeviceAttr[] = "tf.device"; // because tf.TPUCopyWithLayout accepts a host input and produces a device // output. struct TPUDynamicLayoutPass - : public PassWrapper { - void runOnFunction() override; + : public TF::PerFunctionAggregateAnalysisConsumerPass< + TPUDynamicLayoutPass, TF::ResourceAliasAnalysis> { + void runOnFunction( + FuncOp func, + const TF::ResourceAliasAnalysis::Info& resource_alias_analysis); }; // Checks if the input producer op is supported in this transform. Right now, we // only check if it is a tf.IteratorGetNext where resource input is coming from // a VarHandle on CPU or a function argument assigned to CPU. 
-bool IsSupportedInputOp(Operation* op, - TF::ResourceAliasAnalysis* resource_alias_analysis) { +bool IsSupportedInputOp( + Operation* op, + const TF::ResourceAliasAnalysis::Info& resource_alias_analysis) { TF::IteratorGetNextOp iterator_op = llvm::dyn_cast(op); if (!iterator_op) return false; Value resource_iterator = iterator_op.iterator(); - if (resource_alias_analysis->IsUnknownResource(resource_iterator)) + if (resource_alias_analysis.IsUnknownResource(resource_iterator)) return false; llvm::SmallSetVector aliases = - resource_alias_analysis->GetResourceAliases(resource_iterator); + resource_alias_analysis.GetResourceAliases(resource_iterator); auto is_generator = [](Value val) { if (val.isa()) return true; @@ -154,8 +159,7 @@ TF::TPUCopyWithLayoutOp BuildCopyWithLayout(tf_device::LaunchOp execute_launch, Value input, OpBuilder* builder) { return builder->create( execute_launch.getLoc(), llvm::ArrayRef{input.getType()}, - llvm::ArrayRef{input, get_layout.layout()}, - llvm::ArrayRef{}); + llvm::ArrayRef{input, get_layout.layout()}); } // Performs transformation for a non-replicated input. @@ -178,7 +182,7 @@ bool HandleReplicatedInputs( const int64_t execute_arg_index, Value compilation_key, tf_device::LaunchOp execute_launch, tf_device::LaunchOp compile_launch, const int64_t replicate_arg_index, tf_device::ReplicateOp replicate, - TF::ResourceAliasAnalysis* resource_alias_analysis) { + const TF::ResourceAliasAnalysis::Info& resource_alias_analysis) { // We need to know the devices to copy to. if (!replicate.devices()) return false; int64_t num_replicas = replicate.n().getZExtValue(); @@ -216,7 +220,7 @@ bool HandleReplicatedInputs( void HandleCompileAndExecutes( tf_device::LaunchOp compile_launch, llvm::MutableArrayRef execute_launches, - TF::ResourceAliasAnalysis* resource_alias_analysis) { + const TF::ResourceAliasAnalysis::Info& resource_alias_analysis) { auto compile = llvm::cast(compile_launch.GetBody().front()); tensorflow::tpu::TPUCompileMetadataProto metadata; @@ -274,9 +278,10 @@ void HandleCompileAndExecutes( compile.getContext())); } -void TPUDynamicLayoutPass::runOnFunction() { - TF::ResourceAliasAnalysis resource_alias_analysis(getFunction()); - getFunction().walk([&](TF::_TPUCompileMlirOp compile) { +void TPUDynamicLayoutPass::runOnFunction( + FuncOp func, + const TF::ResourceAliasAnalysis::Info& resource_alias_analysis) { + func.walk([&](TF::_TPUCompileMlirOp compile) { // Detect tf._TPUCompileMlir -> tf.TPUExecute(s). auto compile_launch = llvm::dyn_cast(compile.getParentOp()); @@ -296,13 +301,13 @@ void TPUDynamicLayoutPass::runOnFunction() { } HandleCompileAndExecutes(compile_launch, execute_launches, - &resource_alias_analysis); + resource_alias_analysis); }); } } // namespace -std::unique_ptr> CreateTPUDynamicLayoutPass() { +std::unique_ptr> CreateTPUDynamicLayoutPass() { return std::make_unique(); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc index af0675197ac..9365807663a 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc @@ -17,11 +17,21 @@ limitations under the License. 
#include #include +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/FormatVariadic.h" +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/RegionUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -34,10 +44,7 @@ namespace TFTPU { namespace { -constexpr char kAncestorsAttr[] = "ancestors"; constexpr char kDeviceAttr[] = "device"; -constexpr char kKeyAttr[] = "key"; -constexpr char kShapesAttr[] = "shapes"; constexpr char kXlaOutsideCompilationAttr[] = "_xla_outside_compilation"; // Mapping for `_xla_outside_compilation` attribute to ops of a cluster. @@ -80,31 +87,203 @@ struct TPUExtractOutsideCompilation void runOnOperation() override; }; -// Collects and clusters ops in `block` with the same `_xla_outside_compilation` -// attribute into `clusters` This returns an error if a -// `_xla_outside_compilation` attribute of an op is empty. -LogicalResult CollectAndGroupOutsideClusterOps(Block* block, - OutsideClusterMap* clusters) { - for (Operation& op : *block) { - if (auto attr = op.getAttrOfType(kXlaOutsideCompilationAttr)) { - if (attr.getValue().empty()) - return op.emitError() - << "attribute '" << kXlaOutsideCompilationAttr << "' is empty"; +// Holds information about control flow operations that wrap outside compiled +// op. Currently only tf.If op is supported. +class ControlFlowStackInfo { + public: + enum ControlFlowBranchType { kIfThen, kIfElse }; - auto it = clusters->try_emplace(attr.getValue()); - it.first->getSecond().push_back(&op); + explicit ControlFlowStackInfo(Operation* wrapping_op, Operation* nested_op) + : callsite_op_(wrapping_op) { + // Only tf.IfRegion op is supported for now. + auto control_flow_op = llvm::cast(callsite_op_); + assert(control_flow_op); + + auto parent_region = nested_op->getParentRegion(); + if (&control_flow_op.then_branch() == parent_region) { + type_ = ControlFlowBranchType::kIfThen; + } else { + type_ = ControlFlowBranchType::kIfElse; } } - return success(); + Value GetIfPredicateValue() { + auto if_op = llvm::cast(callsite_op_); + return if_op.cond(); + } + + ControlFlowBranchType GetBranchType() const { return type_; } + + Operation* GetCallSiteOp() const { return callsite_op_; } + + private: + ControlFlowBranchType type_; + + // `this` does not hold ownership of `callsite_op_`. + Operation* callsite_op_; +}; + +// Returns a list of ControlFlowStackInfo that represents a stack of control +// flow operations that wraps `op`. 
+llvm::SmallVector GetControlFlowStackForOp( + tf_device::ClusterOp tpu_cluster, Operation* op) { + assert(tpu_cluster.getOperation()->isProperAncestor(op)); + + llvm::SmallVector controlflow_stack; + Operation* op_in_stack = op; + while (op_in_stack != tpu_cluster.getOperation()) { + auto parent_op = op_in_stack->getParentOp(); + if (llvm::isa(parent_op)) { + controlflow_stack.insert(controlflow_stack.begin(), + ControlFlowStackInfo(parent_op, op_in_stack)); + } + op_in_stack = parent_op; + } + + return controlflow_stack; } -// Moves `cluster_ops` to associated `launch_op` body. -void MoveOutsideClusterOpsToLaunchOp(tf_device::LaunchOp launch_op, - llvm::ArrayRef cluster_ops) { - MLIRContext* context = launch_op.getContext(); - Operation* terminator = launch_op.GetBody().getTerminator(); +// Creates a IfRegionOp with `predicate` and then/else region with yield op and +// an empty block. +TF::IfRegionOp CloneEmptyIfWithPredicate(Value predicate, bool is_stateless, + Location loc, OpBuilder* builder) { + auto host_side_if = builder->create( + loc, llvm::SmallVector{}, predicate, is_stateless); + // Create empty then branch region. + auto& then_branch = host_side_if.then_branch(); + builder->setInsertionPoint(&then_branch.front(), then_branch.front().begin()); + builder->createBlock(&then_branch); + builder->create(loc, llvm::SmallVector({})); + + // Create empty else branch region. + auto& else_branch = host_side_if.else_branch(); + builder->setInsertionPoint(&else_branch.front(), else_branch.front().begin()); + builder->createBlock(&else_branch); + builder->create(loc, llvm::SmallVector({})); + return host_side_if; +} + +// Replicates tf.IfRegion op to host side computation. +Operation* ReplicateIf(const ControlFlowStackInfo& controlflow_info, + llvm::StringRef outside_cluster_name, ModuleOp module, + Value compilation_key, OpBuilder* builder, + int* send_recv_counter) { + // Create XlaSendToHostOp to send predicate value from device to host. + OpBuilder::InsertPoint insert_point = builder->saveInsertionPoint(); + auto if_callsite_op = + llvm::cast(controlflow_info.GetCallSiteOp()); + builder->setInsertionPoint(if_callsite_op); + + const auto predicate_send_recv_key = + llvm::formatv("if_predicate_channel_{0}_{1}", outside_cluster_name, + *send_recv_counter) + .str(); + *send_recv_counter += 1; + + auto predicate = if_callsite_op.cond(); + auto predicate_shape = predicate.getType(); + builder->create(if_callsite_op.getLoc(), predicate, + predicate_send_recv_key); + + // Create XlaRecvAtHostOp to receive predicate value from host. + builder->restoreInsertionPoint(insert_point); + auto recv_predicate_at_host = builder->create( + if_callsite_op.getLoc(), llvm::ArrayRef{predicate_shape}, + /*dynamic_key=*/compilation_key, + builder->getStringAttr(predicate_send_recv_key), + /*device_ordinal=*/builder->getI64IntegerAttr(0)); + + // Create host side if op. + return CloneEmptyIfWithPredicate(recv_predicate_at_host.getResult(0), + if_callsite_op.is_stateless(), + if_callsite_op.getLoc(), builder); +} + +// TODO(b/157054714): Use a better abstraction instead of +// _TPUCompileMlirOp and _XlaRecvAtHostOp and _XlaSendFromHostOp. +// Creates a compilation key as placeholder. A placeholder compilation cache key +// is created because it is a required input to _XlaRecvAtHost and +// _XlaSendFromHost but the _TPUCompileMlir has not yet been created for the TPU +// cluster that contains the outside compiled ops. 
This placeholder should be +// replaced by the TPU cluster _TPUCompileMlir in a subsequent pass. +Value CreateCompilationKeyPlaceholder(Location loc, OpBuilder* builder) { + auto result_type = + RankedTensorType::get({2}, builder->getType()); + return builder->create( + loc, /*program=*/result_type, llvm::ArrayRef{}); +} + +// Replicates the control flow operations that wraps outside compiled ops to +// `destination_block`. +Block* ReplicateControlFlowStack( + llvm::StringRef outside_cluster_name, + const llvm::SmallVectorImpl& stack_info, + tf_device::ClusterOp tpu_cluster, ModuleOp module, Value compilation_key, + Block* destination_block, int* send_recv_counter) { + assert(stack_info.size()); + OpBuilder builder = OpBuilder::atBlockTerminator(destination_block); + Operation* previous_replicated_controlflow_op = nullptr; + for (const auto& controlflow_stack_info : stack_info) { + // Create control flow op given provided insertion point and + // ControlFlowStackInfo. + previous_replicated_controlflow_op = + ReplicateIf(controlflow_stack_info, outside_cluster_name, module, + compilation_key, &builder, send_recv_counter); + auto if_op = llvm::cast(previous_replicated_controlflow_op); + auto type = controlflow_stack_info.GetBranchType(); + + // Update the insertion point to proper region inside the newly created + // control flow op. + if (type == ControlFlowStackInfo::kIfThen) { + builder.setInsertionPoint(&if_op.then_branch().front().front()); + } else { + builder.setInsertionPoint(&if_op.else_branch().front().front()); + } + } + + // Return the inner most branch at which outside compiled op is located. + // This block will later be used as insertion point to create send/recv ops. + auto inner_most_controlflow_stack = stack_info.back(); + auto inner_most_if = + llvm::cast(previous_replicated_controlflow_op); + if (inner_most_controlflow_stack.GetBranchType() == + ControlFlowStackInfo::kIfThen) { + return &inner_most_if.then_branch().front(); + } else { + return &inner_most_if.else_branch().front(); + } +} + +// Collects and clusters ops in `block` with the same `_xla_outside_compilation` +// attribute into `clusters` This returns an error if a +// `_xla_outside_compilation` attribute of an op is empty. +// TODO(b/163141763): Make sure ops inside control flow regions are not outside +// compiled if the entire control flow op is marked as outside compiled. +LogicalResult CollectAndGroupOutsideClusterOps(Block* block, + OutsideClusterMap* clusters) { + auto walk_result = block->walk([&](Operation* op) { + if (auto attr = op->getAttrOfType(kXlaOutsideCompilationAttr)) { + if (attr.getValue().empty()) { + op->emitError() << "attribute '" << kXlaOutsideCompilationAttr + << "' is empty"; + return WalkResult::interrupt(); + } + + auto it = clusters->try_emplace(attr.getValue()); + it.first->getSecond().push_back(op); + } + return WalkResult::advance(); + }); + + return failure(walk_result.wasInterrupted()); +} + +// Moves `cluster_ops` to associated `block`. +void MoveOutsideClusterOpsToBlock(Block& block, + llvm::ArrayRef cluster_ops, + MLIRContext* context) { + Operation* terminator = block.getTerminator(); for (Operation* cluster_op : cluster_ops) { // Remove `_xla_outside_compilation` and `device` attribute from ops in the // cluster as that information will be present in the `launch_op`. @@ -115,7 +294,7 @@ void MoveOutsideClusterOpsToLaunchOp(tf_device::LaunchOp launch_op, } } -// Creates a `tf_device::LaunchOp` to wrap cluster ops. +// Creates a `tf_device.launch` to wrap cluster ops. 
tf_device::LaunchOp CreateLaunchOpForOutsideCluster(
OpBuilder* builder, Operation* last_cluster_op,
llvm::StringRef host_device) {
@@ -196,78 +375,91 @@ void SetHostComputeInsertion(
// Creates the HostCompute with `inputs` and `outputs`
// using `communication_key`.
-TF::_HostComputeMlirOp CreateHostCompute(
+TF::_XlaHostComputeMlirOp CreateHostCompute(
OpBuilder* builder, tf_device::ClusterOp tpu_cluster,
llvm::ArrayRef cluster_ops,
const llvm::SmallSetVector& inputs, llvm::ArrayRef outputs,
- llvm::StringRef communication_key) {
+ llvm::StringRef args_communication_key,
+ llvm::StringRef retvals_communication_key) {
llvm::SmallVector device_output_types;
for (const auto& output : outputs)
device_output_types.push_back(output.getType());
SetHostComputeInsertion(builder, cluster_ops, inputs);
- auto host_compute = builder->create(
+ auto host_compute = builder->create(
tpu_cluster.getLoc(), device_output_types, inputs.getArrayRef(),
- llvm::ArrayRef{});
- host_compute.setAttr(kAncestorsAttr, builder->getArrayAttr({}));
- host_compute.setAttr(kShapesAttr, builder->getArrayAttr({}));
- host_compute.setAttr(kKeyAttr, builder->getStringAttr(communication_key));
+ builder->getStringAttr(args_communication_key),
+ builder->getStringAttr(retvals_communication_key),
+ /*tpu_core=*/builder->getI64IntegerAttr(0));
return host_compute;
}
void MoveOutsideCompiledOps(
- tf_device::ClusterOp tpu_cluster, llvm::StringRef outside_cluster_name,
- tf_device::LaunchOp host_launch_op, llvm::ArrayRef cluster_ops,
+ ModuleOp module, tf_device::ClusterOp tpu_cluster,
+ llvm::StringRef outside_cluster_name, tf_device::LaunchOp host_launch_op,
+ llvm::ArrayRef cluster_ops,
const llvm::SmallSetVector& external_inputs,
llvm::ArrayRef external_outputs) {
+ // Since ops in `cluster_ops` do not cross function/control flow boundary, it
+ // is sufficient to identify the control flow that wraps `cluster_ops` by
+ // looking at any arbitrary op inside `cluster_ops`.
+ auto controlflow_stack =
+ GetControlFlowStackForOp(tpu_cluster, cluster_ops.front());
+
+ Value compilation_key;
+ if (!controlflow_stack.empty() || !external_inputs.empty() ||
+ !external_outputs.empty()) {
+ OpBuilder builder(&host_launch_op.GetBody().front());
+ compilation_key =
+ CreateCompilationKeyPlaceholder(tpu_cluster.getLoc(), &builder);
+ }
+
+ Block* block_to_move_host_cluster = nullptr;
+ if (controlflow_stack.empty()) {
+ block_to_move_host_cluster = &host_launch_op.GetBody();
+ } else {
+ int send_recv_counter = 0;
+ block_to_move_host_cluster = ReplicateControlFlowStack(
+ outside_cluster_name, controlflow_stack, tpu_cluster, module,
+ compilation_key, &host_launch_op.GetBody(), &send_recv_counter);
+ }
+
+ MLIRContext* context = host_launch_op.getContext();
if (external_inputs.empty() && external_outputs.empty()) {
- MoveOutsideClusterOpsToLaunchOp(host_launch_op, cluster_ops);
+ MoveOutsideClusterOpsToBlock(*block_to_move_host_cluster, cluster_ops,
+ context);
return;
}
- OpBuilder builder(host_launch_op.GetBody().getTerminator());
- auto result_type =
- RankedTensorType::get({}, builder.getType());
-
- std::string txt_metadata;
- std::string txt_module;
- // TODO(b/157054714): Use a better abstraction instead of _TPUCompileMlirOp
- // and _XlaRecvAtHostOp and _XlaSendFromHostOp.
-
- // A placeholder _TpuCompileMlirOp is created because it is required input to
- // XlaRecvAtHostOp and XlaSendFromHostOp but the _TpuCompileMlirOp has not yet
- // been created for the TPU cluster that contains the outside compiled ops.
- // This placeholder should be replaced by the TPU cluster _TPUCompileMlirOp in - // a subsequent pass. - auto compile_op = builder.create( - tpu_cluster.getLoc(), /*compilation_status=*/result_type, /*program=*/ - llvm::ArrayRef{result_type}, llvm::ArrayRef{}, txt_module, - txt_metadata); - + OpBuilder builder(block_to_move_host_cluster->getTerminator()); llvm::SmallVector host_output_types; for (const auto& external_input : external_inputs) host_output_types.push_back(external_input.getType()); - std::string communication_key = - llvm::formatv("host_compute_channel_{0}", outside_cluster_name).str(); - // XlaRecvAtHostOp takes both the program key(dynamic_key) from the - // _TpuCompileMlirOp and the communication_key. + std::string args_communication_key = + llvm::formatv("host_compute_channel_{0}_args", outside_cluster_name) + .str(); + std::string retvals_communication_key = + llvm::formatv("host_compute_channel_{0}_retvals", outside_cluster_name) + .str(); + auto recv_at_host = builder.create( tpu_cluster.getLoc(), host_output_types, - /*dynamic_key=*/compile_op.getResult(1), - builder.getStringAttr(communication_key), - builder.getIntegerAttr(builder.getIntegerType(64), 0)); + /*dynamic_key=*/compilation_key, + builder.getStringAttr(args_communication_key), + /*device_ordinal=*/builder.getI64IntegerAttr(0)); - auto host_compute = - CreateHostCompute(&builder, tpu_cluster, cluster_ops, external_inputs, - external_outputs, communication_key); - MoveOutsideClusterOpsToLaunchOp(host_launch_op, cluster_ops); + auto host_compute = CreateHostCompute( + &builder, tpu_cluster, cluster_ops, external_inputs, external_outputs, + args_communication_key, retvals_communication_key); + MoveOutsideClusterOpsToBlock(*block_to_move_host_cluster, cluster_ops, + context); - builder.setInsertionPoint(host_launch_op.GetBody().getTerminator()); + builder.setInsertionPoint(block_to_move_host_cluster->getTerminator()); builder.create( tpu_cluster.getLoc(), external_outputs, - /*dynamic_key=*/compile_op.getResult(1), - builder.getStringAttr(communication_key), - /*device_ordinal=*/builder.getIntegerAttr(builder.getIntegerType(64), 0)); + /*dynamic_key=*/compilation_key, + builder.getStringAttr(retvals_communication_key), + /*device_ordinal=*/builder.getI64IntegerAttr(0)); for (auto result : llvm::zip(external_inputs, recv_at_host.getResults())) mlir::replaceAllUsesInRegionWith(std::get<0>(result), std::get<1>(result), @@ -280,7 +472,8 @@ void MoveOutsideCompiledOps( // Creates a `parallel_execute` op in place of launch with 'clusters` and // 'launch` as regions. 
-void CreateParallelExecuteFromOutsideClusters(tf_device::ClusterOp tpu_cluster, +void CreateParallelExecuteFromOutsideClusters(ModuleOp module, + tf_device::ClusterOp tpu_cluster, const OutsideClusterMap& clusters, llvm::StringRef host_device) { OpBuilder builder(tpu_cluster); @@ -296,6 +489,7 @@ void CreateParallelExecuteFromOutsideClusters(tf_device::ClusterOp tpu_cluster, Block& outside_block = parallel_execute_op.GetRegionBlockWithIndex(cluster.index()); + builder.setInsertionPointToEnd(&outside_block); tf_device::LaunchOp host_launch_op = CreateLaunchOpForOutsideCluster( &builder, cluster_ops.back(), host_device); @@ -304,10 +498,9 @@ void CreateParallelExecuteFromOutsideClusters(tf_device::ClusterOp tpu_cluster, auto external_inputs = GetExternalOperands(cluster_ops); auto external_outputs = GetExternalOutputs(cluster_ops); - MoveOutsideCompiledOps(tpu_cluster, cluster.value().getFirst(), + MoveOutsideCompiledOps(module, tpu_cluster, cluster.value().getFirst(), host_launch_op, cluster_ops, external_inputs, external_outputs); - builder.setInsertionPointToEnd(&outside_block); builder.create(tpu_cluster.getLoc(), ArrayRef{}); @@ -353,7 +546,8 @@ void TPUExtractOutsideCompilation::runOnOperation() { std::string host_device; tensorflow::GetHostDeviceOutsideComputation(devices, tpu_cluster, &host_device); - CreateParallelExecuteFromOutsideClusters(tpu_cluster, clusters, + + CreateParallelExecuteFromOutsideClusters(module, tpu_cluster, clusters, host_device); return WalkResult::advance(); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_merge_variables_with_execute.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_merge_variables_with_execute.cc index 3fd0dcd5a67..52c9287b619 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_merge_variables_with_execute.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_merge_variables_with_execute.cc @@ -298,7 +298,7 @@ VariableAccessesForTPUExecute BuildVariableAccessInfo( // Populate infos.old_to_new_output_mapping. int new_output_index = 0; infos.old_to_new_output_mapping.resize(execute_launch.getNumResults()); - for (int i = 0; i < execute_launch.getNumResults(); ++i) { + for (int i = 0, end = execute_launch.getNumResults(); i < end; ++i) { if (output_fused[i]) { infos.old_to_new_output_mapping[i] = -1; } else { @@ -375,7 +375,7 @@ void ReplaceParallelExecute(tf_device::ParallelExecuteOp parallel_execute, // Replace the uses of the original parallel_execute for the region containing // the merged execute. auto old_region_results = parallel_execute.GetRegionOutputs(region_index); - for (int i = 0; i < infos.old_to_new_output_mapping.size(); ++i) { + for (int i = 0, end = infos.old_to_new_output_mapping.size(); i < end; ++i) { if (infos.old_to_new_output_mapping[i] < 0) continue; old_region_results[i].replaceAllUsesWith(new_parallel_execute_op->getResult( infos.old_to_new_output_mapping[i] + num_results_before_region)); @@ -407,7 +407,7 @@ void ReplaceExecute(tf_device::LaunchOp execute_launch, tf_device::LaunchOp merged_execute_launch, const VariableAccessesForTPUExecute& infos) { // Replace the uses. 
- for (int i = 0; i < infos.old_to_new_output_mapping.size(); ++i) { + for (int i = 0, end = infos.old_to_new_output_mapping.size(); i < end; ++i) { if (infos.old_to_new_output_mapping[i] < 0) continue; execute_launch.getResult(i).replaceAllUsesWith( merged_execute_launch.getResult(infos.old_to_new_output_mapping[i])); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc index 050ba24417f..ca77feafc05 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc @@ -473,9 +473,8 @@ LogicalResult BuildExecuteOp( if (failed(result)) return failure(); // TPUExecute has same output types as cluster_func. - *execute_op = builder->create( - cluster_func.getLoc(), output_types, inputs, - llvm::ArrayRef{}); + *execute_op = builder->create(cluster_func.getLoc(), + output_types, inputs); return success(); } @@ -644,10 +643,7 @@ LogicalResult Rewrite( // Collect `num_replicas` and `num_cores_per_replica` attributes. int num_replicas = 1; tf_device::ReplicateOp replicate = - cluster_func.getParentOp() - ? llvm::dyn_cast_or_null( - cluster_func.getParentOp()) - : nullptr; + cluster_func.getParentOfType(); if (replicate) num_replicas = replicate.n().getLimitedValue(); auto num_cores_per_replica_attr = cluster_func.getAttrOfType( @@ -716,9 +712,9 @@ LogicalResult Rewrite( // structured lowering. if (auto parallel_op = llvm::dyn_cast( cluster_func.getParentOp())) { - parallel_op.walk([&](TF::_TPUCompileMlirOp parallel_compile_op) { - parallel_compile_op.replaceAllUsesWith(compile_op); - parallel_compile_op.erase(); + parallel_op.walk([&](TF::_TPUCompileMlirPlaceholderProgramKeyOp key_op) { + key_op.replaceAllUsesWith(compile_op->getResult(1)); + key_op.erase(); }); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_space_to_depth_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_space_to_depth_pass.cc index 7befa68f3d8..204a674e632 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_space_to_depth_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_space_to_depth_pass.cc @@ -604,8 +604,7 @@ void TPUSpaceToDepthPass::runOnOperation() { } // Get the function on device. - auto device_func = - getOperation().lookupSymbol(cluster_func->getFunc()); + auto device_func = cluster_func->getFunc(); if (!device_func) return; TF::Conv2DOp first_conv; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_update_embedding_enqueue_op_inputs.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_update_embedding_enqueue_op_inputs.cc index f3588c8359b..6cd9f763b87 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_update_embedding_enqueue_op_inputs.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_update_embedding_enqueue_op_inputs.cc @@ -13,24 +13,29 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/Support/Casting.h" #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Block.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" namespace mlir { namespace TFTPU { namespace { +constexpr char kXlaOutsideCompilationAttr[] = "_xla_outside_compilation"; constexpr char kTPUEmbeddingAttr[] = "_tpu_embedding_layer"; struct TPUUpdateEmbeddingEnqueueOpInputs @@ -44,8 +49,7 @@ struct TPUUpdateEmbeddingEnqueueOpInputs LogicalResult ExtractEmbeddingAttribute( Operation* op, llvm::StringMap* embedding_op_map) { auto embedding_attr = op->getAttrOfType(kTPUEmbeddingAttr); - if (!embedding_attr) - return op->emitOpError("requires attribute '_tpu_embedding_layer'"); + if (!embedding_attr) return mlir::success(); if (!embedding_op_map->insert({embedding_attr.getValue(), op}).second) return op->emitOpError( @@ -87,7 +91,8 @@ LogicalResult FindTPUEmbeddingOps( LogicalResult UpdateEmbeddingEnqueueOpInput( const llvm::StringMap& enqueue_op_map, const llvm::StringMap& recv_activation_op_map, - const llvm::StringMap& send_gradient_op_map) { + const llvm::StringMap& send_gradient_op_map, + OpBuilder* builder) { for (const auto& it : enqueue_op_map) { const auto& embedding_attr = it.getKey(); Operation* embedding_op = it.second; @@ -97,21 +102,36 @@ LogicalResult UpdateEmbeddingEnqueueOpInput( << TF::RecvTPUEmbeddingActivationsOp::getOperationName() << "' op"; // TPU Embedding enqueue ops take different inputs depending on whether - // graph is in training mode or in eval/prediction mode. The inputs to the - // enqueue ops are present/listed as operands to SelectV2 op. Then branch - // operand of the SelectV2 op represents input to take during training - // and else branch operand represents input to take during - // prediction/evaluation. If SendTPUEmbeddingGradients op exists in the - // graph, then graph is in training mode, so correctly forward the input - // of SelectV2 op as operand to the TPU embedding enqueue op. + // graph is in training mode or in eval/prediction mode. During training, + // the mode parameter for TPUEmbeddingEnqueue op must be `train` and for + // evaluation or prediction, mode must be set to `inference`. + // If SendTPUEmbeddingGradients op exists in the graph, then graph is + // in training mode, so create a const op with value `train` use the + // output value of the constant as an operand to the TPU embedding + // enqueue op. bool is_training = send_gradient_op_map.count(embedding_attr); - for (auto enqueue_operand : embedding_op->getOperands()) { - if (auto select = llvm::dyn_cast_or_null( - enqueue_operand.getDefiningOp())) { - enqueue_operand.replaceAllUsesWith(is_training ? 
select.t() - : select.e()); - } - } + + // The last operand of TPUEmbeddingEnqueue ops is the mode which + // represents whether graph is in training mode or in evaluation mode. + auto& mode_enqueue_operand = + embedding_op->getOpOperand(embedding_op->getNumOperands() - 1); + + llvm::SmallVector mode_string_value; + mode_string_value.emplace_back(is_training ? "train" : "inference"); + builder->setInsertionPoint(embedding_op); + auto enqueue_mode = builder->create( + embedding_op->getLoc(), + DenseStringElementsAttr::get( + RankedTensorType::get({}, builder->getType()), + mode_string_value)); + + auto outside_compilation_attr = + embedding_op->getAttrOfType(kXlaOutsideCompilationAttr); + if (outside_compilation_attr) + enqueue_mode.setAttr(kXlaOutsideCompilationAttr, + outside_compilation_attr); + + mode_enqueue_operand.set(enqueue_mode); } return success(); @@ -141,8 +161,9 @@ void TPUUpdateEmbeddingEnqueueOpInputs::runOnFunction() { return signalPassFailure(); } - if (failed(UpdateEmbeddingEnqueueOpInput( - enqueue_op_map, recv_activation_op_map, send_gradient_op_map))) + if (failed(UpdateEmbeddingEnqueueOpInput(enqueue_op_map, + recv_activation_op_map, + send_gradient_op_map, &builder))) return signalPassFailure(); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc index 5bc6bd4e053..3262b83fc94 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc @@ -351,7 +351,7 @@ TF::WhileOp AddStateVarsToWhileOp(TF::WhileOp while_op, FuncOp body, cond.setType(FunctionType::get(append_types(cond.getType().getInputs()), cond.getType().getResults(), cond.getContext())); - for (int64_t i = 0; i < state_vars.size(); ++i) { + for (int64_t i = 0, end = state_vars.size(); i < end; ++i) { int64_t arg_index = body.getNumArguments() - state_vars.size() + i; TF::VarHandleOp state_var = state_vars[i]; auto device_attr = state_var.getAttr(kDeviceAttr); @@ -365,16 +365,6 @@ TF::WhileOp AddStateVarsToWhileOp(TF::WhileOp while_op, FuncOp body, while_op.getLoc(), append_types(llvm::to_vector<4>(while_op.getResultTypes())), new_while_operands, while_op.getAttrs()); - if (new_while_op.output_shapes().size() != 0) { - auto new_output_shapes = llvm::to_vector<4>(new_while_op.output_shapes()); - // VarHandleOp is a scalar shape resource. - for (int64_t i = 0; i < state_vars.size(); ++i) { - new_output_shapes.push_back( - mlir::TF::ShapeAttr::get(builder.getContext(), ArrayRef())); - } - new_while_op.setAttr("output_shapes", - builder.getArrayAttr(new_output_shapes)); - } while_op.replaceAllUsesWith( new_while_op.getResults().take_front(while_op.getNumResults())); while_op.erase(); @@ -462,9 +452,8 @@ void HandleReplicateOp(TF::WhileOp while_op, tf_device::ReplicateOp replicate, !llvm::isa(compile_launch.GetBody().front())) return; - auto module = while_op.getParentOfType(); - auto body = llvm::cast(module.lookupSymbol(while_op.body())); - auto cond = llvm::cast(module.lookupSymbol(while_op.cond())); + FuncOp body = while_op.body_func(); + FuncOp cond = while_op.cond_func(); // Analyze the formattable inputs. 
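
In the TPU embedding enqueue rewrite above, the angle-bracketed template arguments of the `create` and `getType` calls are not visible, so the snippet reads as if they had none. A hedged reconstruction, assuming `TF::ConstOp`, `TF::StringType`, and `llvm::SmallVector<llvm::StringRef, 1>` as the elided parameters (inferred from the surrounding TF dialect code, not spelled out in this diff):

    // Build a scalar string constant holding "train" or "inference" and feed it
    // to the trailing `mode` operand of the TPU embedding enqueue op.
    llvm::SmallVector<llvm::StringRef, 1> mode_string_value;
    mode_string_value.emplace_back(is_training ? "train" : "inference");
    builder->setInsertionPoint(embedding_op);
    auto enqueue_mode = builder->create<TF::ConstOp>(
        embedding_op->getLoc(),
        mlir::DenseStringElementsAttr::get(
            mlir::RankedTensorType::get({}, builder->getType<TF::StringType>()),
            mode_string_value));

With the mode carried by its own constant operand, the pass no longer has to pattern-match a `SelectV2` feeding the enqueue op to decide between the training and inference inputs.
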
auto execute_arg_to_outer_args = @@ -521,8 +510,7 @@ void HandleReplicateOp(TF::WhileOp while_op, tf_device::ReplicateOp replicate, replicate.GetNumReplicatedBlockArguments() - 1)); builder.setInsertionPoint(execute_launch); auto reformat_op = builder.create( - execute_launch.getLoc(), llvm::ArrayRef{}, reformat_operands, - llvm::ArrayRef{}); + execute_launch.getLoc(), llvm::ArrayRef{}, reformat_operands); WrapOpInLaunch(&builder, execute_launch.getLoc(), reformat_op, execute_launch.device()); @@ -579,8 +567,7 @@ void HandleReplicateOp(TF::WhileOp while_op, tf_device::ReplicateOp replicate, default_state_key.getResult()); // Unformat op. auto unformat_op = builder.create( - while_op.getLoc(), llvm::ArrayRef{}, unformat_operands, - llvm::ArrayRef{}); + while_op.getLoc(), llvm::ArrayRef{}, unformat_operands); WrapOpInLaunch(&builder, execute_launch.getLoc(), unformat_op, execute_launch.device()); builder.create(while_op.getLoc(), ArrayRef{}); diff --git a/tensorflow/compiler/mlir/tensorflow/translate/breakup-islands.cc b/tensorflow/compiler/mlir/tensorflow/translate/breakup-islands.cc index f09cf7b093e..0a69987deb0 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/breakup-islands.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/breakup-islands.cc @@ -41,25 +41,28 @@ namespace mlir { namespace { -struct BreakUpIslands : PassWrapper { - void runOnFunction() final; +class BreakUpIslands : public TF::PerFunctionAggregateAnalysisConsumerPass< + BreakUpIslands, TF::SideEffectAnalysis> { + public: + void runOnFunction(FuncOp func, + const TF::SideEffectAnalysis::Info& side_effect_analysis); void BreakUpIsland(tf_executor::IslandOp island_op, - const TF::SideEffectAnalysis& side_effect_analysis, + const TF::SideEffectAnalysis::Info& side_effect_analysis, llvm::DenseMap>* new_control_inputs); }; -void BreakUpIslands::runOnFunction() { - auto graph_op_range = getFunction().getBody().front().without_terminator(); +void BreakUpIslands::runOnFunction( + FuncOp func, const TF::SideEffectAnalysis::Info& side_effect_analysis) { + auto graph_op_range = func.front().without_terminator(); tf_executor::GraphOp graph_op; - if (graph_op_range.begin() != graph_op_range.end() && - std::next(graph_op_range.begin()) == graph_op_range.end()) { - graph_op = dyn_cast( - getOperation().getBody().front().front()); - } + + if (llvm::hasSingleElement(graph_op_range)) + graph_op = dyn_cast(func.front().front()); + if (!graph_op) { - getOperation().emitError("expected function to contain only a graph_op"); + func.emitError("expected function to contain only a graph_op"); signalPassFailure(); return; } @@ -67,7 +70,6 @@ void BreakUpIslands::runOnFunction() { // New control inputs to be added. For an operation x, new_control_inputs[x] // contains all control inputs that need to be added to x as operands. llvm::DenseMap> new_control_inputs; - auto& side_effect_analysis = getAnalysis(); // Iterate in reverse order to avoid invalidating Operation* stored in // new_control_inputs. for (auto& item : @@ -76,7 +78,7 @@ void BreakUpIslands::runOnFunction() { BreakUpIsland(island, side_effect_analysis, &new_control_inputs); } } - OpBuilder builder(getOperation()); + OpBuilder builder(func); // For every op, add new control inputs in reverse order so that the ops don't // get invalidated. @@ -181,7 +183,7 @@ struct IslandSourcesAndSinks { // Finds IslandSourcesAndSinks for an unmodified island. 
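
The `llvm::hasSingleElement(graph_op_range)` check introduced in `BreakUpIslands::runOnFunction` above is a drop-in replacement for the explicit begin/next/end comparison it removes. A small equivalence sketch (the range name is only illustrative):

    #include <iterator>
    #include "llvm/ADT/STLExtras.h"

    template <typename RangeT>
    bool HasExactlyOneOp(RangeT&& graph_op_range) {
      // Equivalent to:
      //   graph_op_range.begin() != graph_op_range.end() &&
      //   std::next(graph_op_range.begin()) == graph_op_range.end()
      return llvm::hasSingleElement(graph_op_range);
    }
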
IslandSourcesAndSinks FindSourcesAndSinksInIsland( tf_executor::IslandOp island, - const TF::SideEffectAnalysis& side_effect_analysis) { + const TF::SideEffectAnalysis::Info& side_effect_analysis) { IslandSourcesAndSinks result; auto island_body = island.GetBody().without_terminator(); for (Operation& sub_op : island_body) { @@ -208,7 +210,7 @@ IslandSourcesAndSinks FindSourcesAndSinksInIsland( // are chained together by control flow values. void BreakUpIslands::BreakUpIsland( tf_executor::IslandOp island_op, - const TF::SideEffectAnalysis& side_effect_analysis, + const TF::SideEffectAnalysis::Info& side_effect_analysis, llvm::DenseMap>* new_control_inputs) { auto island_body = island_op.GetBody().without_terminator(); @@ -323,7 +325,7 @@ void BreakUpIslands::BreakUpIsland( } // namespace -std::unique_ptr> CreateBreakUpIslandsPass() { +std::unique_ptr> CreateBreakUpIslandsPass() { return std::make_unique(); } diff --git a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc index 7983dfe0065..571d5e3e715 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc @@ -511,17 +511,19 @@ StatusOr> Exporter::Convert( // generate unique names. if (!output_names.empty()) { const int num_data_results = graph_op.getNumResults(); - TF_RET_CHECK(output_names.size() == num_data_results) + const int64 output_names_size = output_names.size(); + TF_RET_CHECK(output_names_size == num_data_results) << "output names (" << output_names.size() << ") != terminator operands (" << num_data_results << ")"; llvm::DenseMap output_op_to_name; llvm::StringMap name_to_op; for (const auto& it : llvm::enumerate(graph_op.GetFetch().getOperands())) { // Skip control rets. - if (it.index() >= num_data_results) break; + const int64 index = it.index(); + if (index >= num_data_results) break; // TODO(jpienaar): If there is a result index specified, ensure only one // and that it matches the result index of the op. 
- std::string orig_name(output_names[it.index()]); + std::string orig_name(output_names[index]); auto tensor_id = ParseTensorName(orig_name); auto name = LegalizeNodeName( llvm::StringRef(tensor_id.node().data(), tensor_id.node().size())); diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc index c7d5339f93c..94ddf76736e 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc @@ -119,7 +119,6 @@ static inline absl::string_view StringRefToView(llvm::StringRef ref) { namespace tensorflow { using mlir::NamedAttrList; using mlir::TensorType; -using mlir::TF::VarHandleOp; using mlir::tf_saved_model::AssetOp; using mlir::tf_saved_model::GlobalTensorOp; using mlir::tf_saved_model::SessionInitializerOp; @@ -129,12 +128,6 @@ namespace { constexpr char kTpuReplicateAttr[] = "_tpu_replicate"; -bool IsDisableCallShapeInferenceAttribute(const AttrValue& attr_value, - llvm::StringRef attr_name) { - return attr_name.compare("_disable_call_shape_inference") == 0 && - attr_value.value_case() == AttrValue::kB; -} - bool IsOutputShapesAttribute(const AttrValue& attr_value, llvm::StringRef attr_name) { return attr_name.compare("_output_shapes") == 0 && @@ -336,14 +329,11 @@ class ImporterBase { NamedAttrList* attributes); // Helper to create either a tf_executor operation or a TF operation wrapped - // in an island. When convert_to_legacy_call is true, converts the operation - // representing a call to a library function with a name represented in - // node_type_name to LegacyCallOp. + // in an island. mlir::Operation* CreateOperation( const Node& node, llvm::StringRef node_type_name, const mlir::OperationState& result, - const llvm::SmallVectorImpl& control_operands, - bool convert_to_legacy_call = false); + const llvm::SmallVectorImpl& control_operands); // Converts one NodeDef from the input GraphDef into an Operation and // inserts it into the MLIR module using builder_. @@ -1680,8 +1670,7 @@ Status ImporterBase::EmitErrorWithLocationStr(const Node& node, mlir::Operation* ImporterBase::CreateOperation( const Node& node, llvm::StringRef node_type_name, const mlir::OperationState& result, - const llvm::SmallVectorImpl& control_operands, - bool convert_to_legacy_call) { + const llvm::SmallVectorImpl& control_operands) { // For the tf.executor specific operations (not wrapped in an island), we // have an extra returned value for the control result, and we concatenate // control and non-control operands. @@ -1744,25 +1733,7 @@ mlir::Operation* ImporterBase::CreateOperation( mlir::OpBuilder::atBlockEnd(&island.GetBody()); // Create the operation inside the island now. 
- mlir::Operation* inner_op; - if (convert_to_legacy_call) { - bool disable_call_shape_inference = false; - for (const auto& name_and_value : node.attrs()) { - const auto& attr_name = name_and_value.first; - const AttrValue& attr_value = name_and_value.second; - if (IsDisableCallShapeInferenceAttribute(attr_value, attr_name)) { - disable_call_shape_inference = attr_value.b(); - } - } - - mlir::BoolAttr attribute = - builder_.getBoolAttr(disable_call_shape_inference); - inner_op = island_builder.create( - result.location, result.types, result.operands, - island_builder.getSymbolRefAttr(node_type_name), attribute); - } else { - inner_op = island_builder.createOperation(result); - } + mlir::Operation* inner_op = island_builder.createOperation(result); // Sets operand_segment_sizes or result_segment_sizes attribute to the op. const auto set_segment_sizes_attr = @@ -1927,13 +1898,6 @@ Status ImporterBase::ConvertNode(const Node& node) { // Remove _output_shapes attribute that will be added by the exporter. if (IsOutputShapesAttribute(attr_value, attr_name)) continue; - // We represent the _diable_call_shape_inference attribute and remove - // the _output_shapes attribute for LegacyCall. If a call has other - // attributes, we can't convert it to LegacyCall. - if (convert_to_legacy_call && - !IsDisableCallShapeInferenceAttribute(attr_value, attr_name)) { - convert_to_legacy_call = false; - } if (attr_value.value_case() == AttrValue::kFunc) { // Attribute iteration order is not defined for protocol buffer Map. // Process function attributes separately in the lexicographical order to @@ -1957,26 +1921,35 @@ Status ImporterBase::ConvertNode(const Node& node) { result.attributes.push_back(builder_.getNamedAttr( "device", builder_.getStringAttr(std::string(node_def.device())))); - // Map If and StatelessIf op in TensorFlow to the common If op in MLIR and add - // the differentiating attribute. - if (node.IsIfNode()) { - result.name = mlir::OperationName(get_full_op_name("If"), context_); - mlir::BoolAttr val = builder_.getBoolAttr(node_type_name == "StatelessIf"); - result.attributes.push_back(builder_.getNamedAttr("is_stateless", val)); + // Map user function calls to LegacyCall ops and add the user function name + // as an attribute. + if (convert_to_legacy_call) { + result.name = mlir::OperationName(get_full_op_name("LegacyCall"), context_); + mlir::SymbolRefAttr val = builder_.getSymbolRefAttr(node_type_name); + result.addAttribute("f", val); + + if (!result.attributes.get("_disable_call_shape_inference")) { + result.addAttribute("_disable_call_shape_inference", + builder_.getBoolAttr(false)); + } } - // Map While and StatelessWhile op in TensorFlow to the common While op in - // MLIR and add the differentiating attribute. - if (node.IsWhileNode()) { - result.name = mlir::OperationName(get_full_op_name("While"), context_); - mlir::BoolAttr val = - builder_.getBoolAttr(node_type_name == "StatelessWhile"); + auto composite_control_flow_op = [&](const std::string& name) { + result.name = mlir::OperationName(get_full_op_name(name), context_); + bool stateless = absl::StartsWith(node_type_name, "Stateless"); + mlir::BoolAttr val = builder_.getBoolAttr(stateless); result.attributes.push_back(builder_.getNamedAttr("is_stateless", val)); - } + }; + + // Map Case/If/While and StatelessCase/If/While op in TensorFlow to the common + // Case/If/While op in MLIR and add the differentiating attribute. 
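
The `composite_control_flow_op` lambda above, together with the `IsCaseNode`/`IsIfNode`/`IsWhileNode` checks that immediately follow, collapses the previously separate If and While handling: the common MLIR op name comes from the node kind, and `is_stateless` is derived from the `Stateless` prefix of the TensorFlow op name. A hedged, stand-alone sketch of that mapping (the `tf.` prefix and the helper struct are illustrative assumptions, not taken from this patch):

    #include <string>
    #include "absl/strings/match.h"

    struct MappedControlFlowOp {
      std::string mlir_op_name;  // e.g. "tf.If"
      bool is_stateless;
    };

    // "StatelessIf"/"If" -> tf.If, "StatelessWhile"/"While" -> tf.While, etc.
    MappedControlFlowOp MapControlFlowNode(const std::string& node_type_name,
                                           const std::string& base_name) {
      bool stateless = absl::StartsWith(node_type_name, "Stateless");
      return {"tf." + base_name, stateless};
    }

    // MapControlFlowNode("StatelessIf", "If") -> {"tf.If", true}
    // MapControlFlowNode("Case", "Case")      -> {"tf.Case", false}
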
+ if (node.IsCaseNode()) composite_control_flow_op("Case"); + if (node.IsIfNode()) composite_control_flow_op("If"); + if (node.IsWhileNode()) composite_control_flow_op("While"); // Register the mapping between the TF node and the newly created operation. - node_values_[node.id()] = CreateOperation( - node, node_type_name, result, control_operands, convert_to_legacy_call); + node_values_[node.id()] = + CreateOperation(node, node_type_name, result, control_operands); return Status::OK(); } @@ -2387,7 +2360,8 @@ GraphDefImporter::GetArgsRetsAndTypesFromFunctionGraph( "' is missing attribute 'index'"); auto index = attr->i(); - if (nodes->size() < index + 1) nodes->resize(index + 1); + const int num_nodes = nodes->size(); + if (num_nodes < index + 1) nodes->resize(index + 1); if ((*nodes)[index].node != nullptr) return errors::InvalidArgument(node->type_string(), " node '", @@ -2895,7 +2869,7 @@ void AdjustBoundInputArgTypes(mlir::ModuleOp module) { mlir::OpBuilder builder(func.getBody()); llvm::SmallVector new_input_types; for (int i = 0, e = func.getNumArguments(); i < e; i++) { - auto arg = func.front().getArgument(i); + auto arg = func.getArgument(i); auto global_tensor = mlir::tf_saved_model::LookupBoundInputOfType< mlir::tf_saved_model::GlobalTensorOp>(func, i, symbol_table); if (global_tensor) { @@ -3108,7 +3082,8 @@ Status CreateSavedModelIR( TF_ASSIGN_OR_RETURN(auto input_index_paths, input_linearizer.GetLeafIndexPaths( error_context + "in input signature: ")); - if (bound_input_base != input_index_paths.size()) { + const int input_index_paths_size = input_index_paths.size(); + if (bound_input_base != input_index_paths_size) { return errors::InvalidArgument( error_context, "Argument mismatch between concrete function input signature " @@ -3389,12 +3364,13 @@ SavedModelSignatureDefImporter::ConvertAssets() { results.reserve(asset_file_defs.size()); mlir::OpBuilder builder(module_->getBodyRegion()); + unsigned i = 0; // Use to generate unique sym_name(s) for duplicate assets. for (const auto& asset : asset_file_defs) { auto asset_op = builder.create( module_->getLoc(), /*sym_name=*/ builder.getStringAttr( - absl::StrCat("__tf_saved_model_asset_", asset.filename())), + absl::StrCat("__tf_saved_model_asset", i++, "_", asset.filename())), /*filename=*/ builder.getStringAttr( io::JoinPath(kSavedModelAssetsDirectory, asset.filename()))); @@ -3590,9 +3566,9 @@ Status SavedModelSignatureDefImporter::LiftVariables() { pm.addPass(mlir::TF::CreatePromoteVarHandlesToArgsPass()); pm.addPass( mlir::tf_saved_model::CreateLiftVariablesPass(bundle_.GetSession())); + pm.addPass(mlir::tf_saved_model::CreateDedupBoundInputBindingPass()); if (mlir::failed(pm.run(*module_))) - return diag_handler.Combine( - errors::Internal("failed to lifting variables.")); + return diag_handler.Combine(errors::Internal("Failed to lift variables.")); return Status::OK(); } diff --git a/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.cc b/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.cc index 77da19d6853..f6d370ca604 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.cc @@ -28,14 +28,15 @@ limitations under the License. 
#include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/types.h" namespace tensorflow { Status ParseOutputArrayInfo(absl::string_view array_names, std::vector* outputs) { - std::vector output_names = absl::StrSplit(array_names, ','); - return ParseOutputArrayInfo(output_names, outputs); + TF_RETURN_IF_ERROR(ParseNodeNames(array_names, *outputs)); + return Status::OK(); } Status ParseOutputArrayInfo(const std::vector& output_names, @@ -51,22 +52,12 @@ Status ParseInputArrayInfo(absl::string_view array_names, absl::string_view data_types, absl::string_view shapes, GraphImportConfig::InputArrays* inputs) { - std::vector node_names = absl::StrSplit(array_names, ','); - std::vector node_dtypes = absl::StrSplit(data_types, ','); - - std::vector node_shapes_str = absl::StrSplit(shapes, ':'); + std::vector node_names; + std::vector node_dtypes; std::vector> node_shapes; - for (int i = 0; i < node_shapes_str.size(); i++) { - std::vector dims; - for (auto& dim_str : absl::StrSplit(node_shapes_str[i], ',')) { - // Treats empty input shape as scalar - if (dim_str.empty()) continue; - int size; - TF_RET_CHECK(absl::SimpleAtoi(dim_str, &size)); - dims.push_back(size); - } - node_shapes.push_back(dims); - } + TF_RETURN_IF_ERROR(ParseNodeNames(array_names, node_names)); + TF_RETURN_IF_ERROR(ParseNodeDataTypes(data_types, node_dtypes)); + TF_RETURN_IF_ERROR(ParseNodeShapes(shapes, node_shapes)); return ParseInputArrayInfo(node_names, node_dtypes, node_shapes, inputs); } @@ -75,8 +66,7 @@ Status ParseInputArrayInfo(const std::vector& node_names, const std::vector>& node_shapes, GraphImportConfig::InputArrays* inputs) { std::vector used_node_dtypes; - if (node_dtypes.empty() || - (node_dtypes.size() == 1 && node_dtypes[0].empty())) { + if (node_dtypes.empty()) { // Mark all the node dtypes Invalid, so the importer can handle them by // using the type from the graph. used_node_dtypes.resize(node_names.size(), DataType_Name(DT_INVALID)); @@ -97,14 +87,14 @@ Status ParseInputArrayInfo(const std::vector& node_names, node_names.size(), ", #data_types ", node_dtypes.size(), ")")); } - if (node_names.size() != node_shapes.size()) { + if (!node_shapes.empty() && node_names.size() != node_shapes.size()) { return errors::FailedPrecondition(absl::StrCat( - "Unmatched node array and data type numbers (#arrays ", - node_names.size(), ", #input_shapes ", node_shapes.size(), ")")); + "Unmatched node array and shape numbers (#arrays ", node_names.size(), + ", #input_shapes ", node_shapes.size(), ")")); } // StringMap doesn't support reserve else reserve input map size here. 
- for (int i = 0; i < node_names.size(); i++) { + for (int i = 0, end = node_names.size(); i < end; i++) { auto& name = node_names[i]; if (name.empty()) continue; @@ -119,11 +109,49 @@ Status ParseInputArrayInfo(const std::vector& node_names, absl::StrCat("Invalid node type '", node_dtypes[i], "'")); } - for (auto& dim : node_shapes[i]) { - info.shape.add_dim()->set_size(dim); + if (!node_shapes.empty()) { + for (auto& dim : node_shapes[i]) { + info.shape.add_dim()->set_size(dim); + } } } return Status::OK(); } +Status ParseNodeShapes(absl::string_view shapes_str, + std::vector>& shapes_vector) { + shapes_vector.clear(); + if (!shapes_str.empty()) { + std::vector node_shapes_str = absl::StrSplit(shapes_str, ':'); + for (int i = 0; i < node_shapes_str.size(); i++) { + std::vector dims; + for (const absl::string_view dim_str : + absl::StrSplit(node_shapes_str[i], ',')) { + // Treats empty input shape as scalar + if (dim_str.empty()) continue; + int size; + TF_RET_CHECK(absl::SimpleAtoi(dim_str, &size)); + dims.push_back(size); + } + shapes_vector.push_back(dims); + } + } + return Status::OK(); +} + +Status ParseNodeNames(absl::string_view names_str, + std::vector& names_vector) { + names_vector = absl::StrSplit(names_str, ',', absl::SkipEmpty()); + return Status::OK(); +} + +Status ParseNodeDataTypes(absl::string_view data_types_str, + std::vector& data_type_vector) { + data_type_vector.clear(); + if (!data_types_str.empty()) { + data_type_vector = absl::StrSplit(data_types_str, ','); + } + return Status::OK(); +} + } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h b/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h index cc38a73d106..334f935a139 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h @@ -96,6 +96,23 @@ Status ParseInputArrayInfo(const std::vector& node_names, const std::vector& node_dtypes, const std::vector>& node_shapes, GraphImportConfig::InputArrays* inputs); + +// Parses shapes from the given string into shapes_vector which is a structured +// format. +// NOTE: If shapes_str is empty, shapes_vector will also be empty. +Status ParseNodeShapes(absl::string_view shapes_str, + std::vector>& shapes_vector); + +// Parses names from the given string into the names_vector. +// NOTE: If names_str is empty, names_vector will also be empty. +Status ParseNodeNames(absl::string_view names_str, + std::vector& names_vector); + +// Parses data types from the given string into the data_type_vector. +// NOTE: If data_types_str is empty, data_type_vector will also be empty. +Status ParseNodeDataTypes(absl::string_view data_types_str, + std::vector& data_type_vector); + } // namespace tensorflow #endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_MLIR_ROUNDTRIP_FLAGS_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc index b782b2c49d9..1c7988d3a40 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc @@ -35,6 +35,7 @@ limitations under the License. 
#include "tensorflow/core/framework/versions.pb.h" #include "tensorflow/core/graph/tensor_id.h" #include "tensorflow/core/grappler/utils/transitive_fanin.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/protobuf/graph_debug_info.pb.h" @@ -42,11 +43,14 @@ namespace tensorflow { static StatusOr GraphdefToMlirImport( llvm::StringRef input, absl::string_view debug_info_file, - absl::string_view input_arrays, absl::string_view input_dtypes, - absl::string_view input_shapes, absl::string_view output_arrays, - absl::string_view control_output_arrays, bool prune_unused_nodes, - bool convert_legacy_fed_inputs, bool graph_as_function, bool upgrade_legacy, - bool enable_shape_inference, mlir::MLIRContext* context) { + const std::vector& input_arrays, + const std::vector& input_dtypes, + const std::vector>& input_shapes, + const std::vector& output_arrays, + const std::vector& control_output_arrays, + bool prune_unused_nodes, bool convert_legacy_fed_inputs, + bool graph_as_function, bool upgrade_legacy, bool enable_shape_inference, + mlir::MLIRContext* context) { GraphDef graphdef; TF_RETURN_IF_ERROR( tensorflow::LoadProtoFromBuffer({input.data(), input.size()}, &graphdef)); @@ -97,11 +101,14 @@ static StatusOr GraphdefToMlirImport( StatusOr GraphdefToMlirTranslateFunction( llvm::StringRef input, absl::string_view debug_info_file, - absl::string_view input_arrays, absl::string_view input_dtypes, - absl::string_view input_shapes, absl::string_view output_arrays, - absl::string_view control_output_arrays, bool prune_unused_nodes, - bool convert_legacy_fed_inputs, bool graph_as_function, bool upgrade_legacy, - bool enable_shape_inference, mlir::MLIRContext* context) { + const std::vector& input_arrays, + const std::vector& input_dtypes, + const std::vector>& input_shapes, + const std::vector& output_arrays, + const std::vector& control_output_arrays, + bool prune_unused_nodes, bool convert_legacy_fed_inputs, + bool graph_as_function, bool upgrade_legacy, bool enable_shape_inference, + mlir::MLIRContext* context) { auto module_or = GraphdefToMlirImport( input, debug_info_file, input_arrays, input_dtypes, input_shapes, output_arrays, control_output_arrays, prune_unused_nodes, @@ -113,6 +120,31 @@ StatusOr GraphdefToMlirTranslateFunction( return module_or; } +StatusOr GraphdefToMlirTranslateFunction( + llvm::StringRef input, absl::string_view debug_info_file, + absl::string_view input_arrays, absl::string_view input_dtypes, + absl::string_view input_shapes, absl::string_view output_arrays, + absl::string_view control_output_arrays, bool prune_unused_nodes, + bool convert_legacy_fed_inputs, bool graph_as_function, bool upgrade_legacy, + bool enable_shape_inference, mlir::MLIRContext* context) { + std::vector input_array_vector; + std::vector input_dtype_vector; + std::vector> input_shapes_vector; + std::vector output_array_vector; + std::vector control_output_array_vector; + TF_RETURN_IF_ERROR(ParseNodeNames(input_arrays, input_array_vector)); + TF_RETURN_IF_ERROR(ParseNodeDataTypes(input_dtypes, input_dtype_vector)); + TF_RETURN_IF_ERROR(ParseNodeNames(output_arrays, output_array_vector)); + TF_RETURN_IF_ERROR(ParseNodeShapes(input_shapes, input_shapes_vector)); + TF_RETURN_IF_ERROR( + ParseNodeNames(control_output_arrays, control_output_array_vector)); + return GraphdefToMlirTranslateFunction( + input, debug_info_file, input_array_vector, input_dtype_vector, + input_shapes_vector, output_array_vector, control_output_array_vector, + 
prune_unused_nodes, convert_legacy_fed_inputs, graph_as_function, + upgrade_legacy, enable_shape_inference, context); +} + StatusOr SavedModelObjectGraphToMlirImport( absl::string_view saved_model_dir, const std::unordered_set& tags, @@ -161,11 +193,14 @@ StatusOr SavedModelSignatureDefsToMlirImport( StatusOr GraphdefToSplattedMlirTranslateFunction( llvm::StringRef input, absl::string_view debug_info_file, - absl::string_view input_arrays, absl::string_view input_dtypes, - absl::string_view input_shapes, absl::string_view output_arrays, - absl::string_view control_output_arrays, bool prune_unused_nodes, - bool convert_legacy_fed_inputs, bool graph_as_function, bool upgrade_legacy, - bool enable_shape_inference, mlir::MLIRContext* context) { + const std::vector& input_arrays, + const std::vector& input_dtypes, + const std::vector>& input_shapes, + const std::vector& output_arrays, + const std::vector& control_output_arrays, + bool prune_unused_nodes, bool convert_legacy_fed_inputs, + bool graph_as_function, bool upgrade_legacy, bool enable_shape_inference, + mlir::MLIRContext* context) { auto module_or = GraphdefToMlirImport( input, debug_info_file, input_arrays, input_dtypes, input_shapes, output_arrays, control_output_arrays, prune_unused_nodes, @@ -211,4 +246,29 @@ StatusOr GraphdefToSplattedMlirTranslateFunction( return module_or; } +StatusOr GraphdefToSplattedMlirTranslateFunction( + llvm::StringRef input, absl::string_view debug_info_file, + absl::string_view input_arrays, absl::string_view input_dtypes, + absl::string_view input_shapes, absl::string_view output_arrays, + absl::string_view control_output_arrays, bool prune_unused_nodes, + bool convert_legacy_fed_inputs, bool graph_as_function, bool upgrade_legacy, + bool enable_shape_inference, mlir::MLIRContext* context) { + std::vector input_array_vector; + std::vector input_dtype_vector; + std::vector> input_shapes_vector; + std::vector output_array_vector; + std::vector control_output_array_vector; + TF_RETURN_IF_ERROR(ParseNodeNames(input_arrays, input_array_vector)); + TF_RETURN_IF_ERROR(ParseNodeDataTypes(input_dtypes, input_dtype_vector)); + TF_RETURN_IF_ERROR(ParseNodeNames(output_arrays, output_array_vector)); + TF_RETURN_IF_ERROR(ParseNodeShapes(input_shapes, input_shapes_vector)); + TF_RETURN_IF_ERROR( + ParseNodeNames(control_output_arrays, control_output_array_vector)); + return GraphdefToSplattedMlirTranslateFunction( + input, debug_info_file, input_array_vector, input_dtype_vector, + input_shapes_vector, output_array_vector, control_output_array_vector, + prune_unused_nodes, convert_legacy_fed_inputs, graph_as_function, + upgrade_legacy, enable_shape_inference, context); +} + } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h index ff5dc287488..0dc49d70192 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h @@ -19,6 +19,7 @@ limitations under the License. #include #include +#include "absl/base/macros.h" #include "absl/strings/string_view.h" #include "absl/types/span.h" #include "mlir/IR/MLIRContext.h" // from @llvm-project @@ -33,9 +34,25 @@ using stream_executor::port::StatusOr; // TODO(antiagainst): Directly manipulating files in library functions is not // a good idea. We should pass in a string/stream here. 
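
Because the string-based overloads are marked `ABSL_DEPRECATED` in the header below, new callers are expected to pass pre-parsed vectors directly. A hedged sketch of calling the structured `GraphdefToMlirTranslateFunction` overload, assuming `std::vector<std::string>` element types where the template arguments are not visible (the `graphdef_string` and `context` variables are assumed to exist in the caller, and the flag values are made up):

    std::vector<std::string> input_arrays = {"x", "y"};
    std::vector<std::string> input_dtypes = {"DT_FLOAT", "DT_FLOAT"};
    std::vector<std::vector<int>> input_shapes = {{1, 2}, {1, 2}};
    std::vector<std::string> output_arrays = {"z"};
    std::vector<std::string> control_output_arrays;
    auto module_or = tensorflow::GraphdefToMlirTranslateFunction(
        graphdef_string, /*debug_info_file=*/"", input_arrays, input_dtypes,
        input_shapes, output_arrays, control_output_arrays,
        /*prune_unused_nodes=*/true, /*convert_legacy_fed_inputs=*/false,
        /*graph_as_function=*/false, /*upgrade_legacy=*/false,
        /*enable_shape_inference=*/false, &context);
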
-// Converts a TensorFlow GraphDef stored in the file with the given -// `input_filename` into a MLIR module. Creates MLIR entities into the -// given MLIR `context`. +// Converts a TensorFlow GraphDef contained in `input` param into a MLIR module. +// Creates MLIR entities into the given MLIR `context`. +StatusOr GraphdefToMlirTranslateFunction( + llvm::StringRef input, absl::string_view debug_info_file, + const std::vector& input_arrays, + const std::vector& input_dtypes, + const std::vector>& input_shapes, + const std::vector& output_arrays, + const std::vector& control_output_arrays, + bool prune_unused_nodes, bool convert_legacy_fed_inputs, + bool graph_as_function, bool upgrade_legacy, + // TODO(jpienaar): Remove this. + bool enable_shape_inference, mlir::MLIRContext* context); + +ABSL_DEPRECATED( + "Please use the other overload of this function which accepts structured " + "inputs instead of strings") +// Converts a TensorFlow GraphDef contained in `input` param into a MLIR module. +// Creates MLIR entities into the given MLIR `context`. StatusOr GraphdefToMlirTranslateFunction( llvm::StringRef input, absl::string_view debug_info_file, absl::string_view input_arrays, absl::string_view input_dtypes, @@ -47,6 +64,22 @@ StatusOr GraphdefToMlirTranslateFunction( // Similar as the above function, but replaces all constant tensors // with randomly generated splat values. +StatusOr GraphdefToSplattedMlirTranslateFunction( + llvm::StringRef input, absl::string_view debug_info_file, + const std::vector& input_arrays, + const std::vector& input_dtypes, + const std::vector>& input_shapes, + const std::vector& output_arrays, + const std::vector& control_output_arrays, + bool prune_unused_nodes, bool convert_legacy_fed_inputs, + bool graph_as_function, bool upgrade_legacy, bool enable_shape_inference, + mlir::MLIRContext* context); + +ABSL_DEPRECATED( + "Please use the other overload of this function which accepts structured " + "inputs instead of strings") +// Similar as the above function, but replaces all constant tensors +// with randomly generated splat values. StatusOr GraphdefToSplattedMlirTranslateFunction( llvm::StringRef input, absl::string_view debug_info_file, absl::string_view input_arrays, absl::string_view input_dtypes, diff --git a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc index 5e548da55f1..eee2f0a560c 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc @@ -17,11 +17,14 @@ limitations under the License. #include "absl/types/optional.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/raw_ostream.h" #include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Dialect.h" // from @llvm-project #include "mlir/IR/Function.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project @@ -36,6 +39,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.h" #include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" @@ -52,6 +56,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/xla/service/hlo_sharding.h" #include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/platform/logging.h" namespace tensorflow { @@ -79,11 +84,17 @@ Status ParseMlirModule(llvm::StringRef mlir_module_string, return Status::OK(); } +// Arguments to a computation can be either a tensor or resource. +struct TensorOrResourceShape { + TensorShape shape; + bool is_resource = false; +}; + // Converts arg_shapes to xla::Shape's and store into xla_input_shapes. Status GetXlaInputShapes( - mlir::ModuleOp module, llvm::ArrayRef arg_shapes, + mlir::ModuleOp module, llvm::ArrayRef arg_shapes, bool use_tuple_args, - const XlaCompiler::ShapeRepresentationFn shape_representation_fn, + const XlaHelpers::ShapeRepresentationFn shape_representation_fn, std::vector* xla_input_shapes) { xla_input_shapes->clear(); @@ -103,7 +114,7 @@ Status GetXlaInputShapes( DataType dtype; TF_RETURN_IF_ERROR(ConvertToDataType(func_type.getInput(i), &dtype)); TF_ASSIGN_OR_RETURN(xla_shape, - shape_representation_fn(arg_shapes[i], dtype, + shape_representation_fn(arg_shapes[i].shape, dtype, /*use_fast_memory=*/false)); // Rewrite layout with sharding, if sharding is set. @@ -132,12 +143,13 @@ Status GetXlaInputShapes( } // Calculates computation output shape and build OutputDescription for each -// output based on static shapes in MLIR module +// output based on static shapes in MLIR module. If an output is a resource +// write, `resource_updates` is populated insead of `outputs` for that output. 
Status GetOutputInfo( mlir::ModuleOp module, - const XlaCompiler::ShapeRepresentationFn shape_representation_fn, - xla::Shape* xla_output_shape, - std::vector* outputs) { + const XlaHelpers::ShapeRepresentationFn shape_representation_fn, + xla::Shape* xla_output_shape, std::vector* outputs, + std::vector* resource_updates) { auto shape_representation_fn_no_fast_memory = [shape_representation_fn](const TensorShape& shape, DataType dtype) { return shape_representation_fn(shape, dtype, /*use_fast_memory=*/false); @@ -148,20 +160,40 @@ Status GetOutputInfo( outputs->clear(); outputs->reserve(func_type.getNumResults()); + resource_updates->reserve(func_type.getNumResults()); std::vector shapes; shapes.reserve(func_type.getNumResults()); - for (mlir::Type type : func_type.getResults()) { + llvm::SmallDenseMap resource_arg_to_write; + for (unsigned i = 0; i < main_func.getNumArguments(); ++i) + if (auto aliasing_output = main_func.getArgAttrOfType( + i, "tf.aliasing_output")) + resource_arg_to_write.insert({aliasing_output.getInt(), i}); + + for (auto type_and_idx : llvm::enumerate(func_type.getResults())) { TF_ASSIGN_OR_RETURN( xla::Shape shape, - xla::TypeToShape(type, shape_representation_fn_no_fast_memory)); - auto tensor_type = type.dyn_cast(); + xla::TypeToShape(type_and_idx.value(), + shape_representation_fn_no_fast_memory)); + auto tensor_type = type_and_idx.value().dyn_cast(); shapes.push_back(shape); + auto it = resource_arg_to_write.find(type_and_idx.index()); + if (it != resource_arg_to_write.end()) { + // Add resource write. + resource_updates->emplace_back(); + XlaResourceUpdate& resource_update = resource_updates->back(); + resource_update.input_index = it->getSecond(); + resource_update.modified = true; + TF_RETURN_IF_ERROR(ConvertToDataType(tensor_type, &resource_update.type)); + TF_RETURN_IF_ERROR(XLAShapeToTensorShape(shape, &resource_update.shape)); + continue; + } + // Construct OutputDescription for result. outputs->emplace_back(); - XlaCompiler::OutputDescription& out_desc = outputs->back(); + XlaOutputDescription& out_desc = outputs->back(); TF_RETURN_IF_ERROR(ConvertToDataType(tensor_type, &out_desc.type)); // TODO(ycao): Support constant output. out_desc.is_constant = false; @@ -181,14 +213,6 @@ Status GetOutputInfo( return Status::OK(); } -// Gets information about how computation updates Tensorflow resources. -// TODO(ycao): Implement logic to compute resource updates when we need to -// support graphs with resource updates in MLIR-based TF compiler bridge. -void GetResourceUpdatesForMlir( - std::vector* resource_updates) { - resource_updates->clear(); -} - // Creates a vector that maps from the parameters of the XLA computation to // their original argument positions. // MLIR-based TF-Compiler bridge doesn't have constant analysis yet, thus no @@ -202,7 +226,7 @@ void GetInputMappingForMlir(int num_inputs, std::vector* input_mapping) { } // Refine MLIR types based on new shape information. -Status RefineShapes(llvm::ArrayRef arg_shapes, +Status RefineShapes(llvm::ArrayRef arg_shapes, mlir::ModuleOp module) { auto producer_or = GetTfGraphProducerVersion(module); if (!producer_or.ok()) return producer_or.status(); @@ -213,15 +237,20 @@ Status RefineShapes(llvm::ArrayRef arg_shapes, { // Convert arg_shapes to a mlir friendly format. 
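
In the `GetOutputInfo` changes above, a result that aliases a resource argument is reported as a resource update instead of an ordinary output: an argument carrying `tf.aliasing_output = 2` means result #2 of `main` is the new value of that resource. The attribute lookup's template argument is not visible above; a hedged reconstruction, assuming `mlir::IntegerAttr` for the attribute type and `llvm::SmallDenseMap<int64_t, unsigned>` for the map (the exact key/value types are an assumption):

    // Map from result index to the resource argument that the result writes back.
    llvm::SmallDenseMap<int64_t, unsigned> resource_arg_to_write;
    for (unsigned i = 0; i < main_func.getNumArguments(); ++i)
      if (auto aliasing_output = main_func.getArgAttrOfType<mlir::IntegerAttr>(
              i, "tf.aliasing_output"))
        resource_arg_to_write.insert({aliasing_output.getInt(), i});

Each aliased result then fills in an `XlaResourceUpdate` (input index, modified flag, type, shape) rather than an `XlaOutputDescription`, which is what the new `Resources` test added at the end of this change asserts.
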
size_t count = 0; - for (const TensorShape& shape : arg_shapes) { - count += shape.dims(); + for (const TensorOrResourceShape& tensor_resource_shape : arg_shapes) { + if (tensor_resource_shape.is_resource) continue; + count += tensor_resource_shape.shape.dims(); } shape_backing.resize(count); arg_shapes_copy.reserve(arg_shapes.size()); size_t offset = 0; - for (const TensorShape& shape : arg_shapes) { + for (const TensorOrResourceShape& tensor_resource_shape : arg_shapes) { + if (tensor_resource_shape.is_resource) { + arg_shapes_copy.push_back(llvm::ArrayRef()); + continue; + } size_t start = offset; - for (tensorflow::TensorShapeDim dim : shape) { + for (tensorflow::TensorShapeDim dim : tensor_resource_shape.shape) { shape_backing[offset] = dim.size; ++offset; } @@ -265,7 +294,7 @@ Status ConvertMLIRToXlaComputation( mlir::ModuleOp module_op, llvm::StringRef device_type, xla::XlaComputation* xla_computation, bool use_tuple_args, bool return_tuple, - const XlaCompiler::ShapeRepresentationFn shape_representation_fn, + const XlaHelpers::ShapeRepresentationFn shape_representation_fn, std::vector> custom_legalization_passes) { mlir::PassManager tf2xla(module_op.getContext()); tf2xla.addNestedPass(mlir::createCanonicalizerPass()); @@ -275,35 +304,33 @@ Status ConvertMLIRToXlaComputation( tf2xla.addPass(mlir::TFDevice::CreateDecomposeResourceOpsPass()); tf2xla.addPass(mlir::TF::CreatePromoteResourcesToArgsPass()); tf2xla.addPass(mlir::createSymbolDCEPass()); + // Guarantee all functions have one use, which enables shape inference. + tf2xla.addPass(mlir::TF::CreateGuaranteeAllFuncsOneUsePass()); tf2xla.addPass(mlir::TF::CreateTFShapeInferencePass()); // LegalizeTFControlFlow encapsulates arguments for control flow operations // with a tuple argument which break the assumption of resource lifting // inside PromoteResourcesToArgs. tf2xla.addPass(mlir::mhlo::createLegalizeTFControlFlowPass()); - tf2xla.addNestedPass(mlir::mhlo::createLegalizeTFPass(true)); + tf2xla.addNestedPass(mlir::mhlo::createLegalizeTFPass( + /*allow_partial_conversion=*/true, /*legalize_chlo=*/true, + /*tf2xla_fallback_device_type=*/device_type)); for (auto& target_pass : custom_legalization_passes) { tf2xla.addNestedPass(std::move(target_pass)); } tf2xla.addNestedPass(mlir::createCanonicalizerPass()); - tf2xla.addPass(mlir::TF::CreateTFShapeInferencePass()); - - // Leverage tf2xla kernels for ops that didn't get lowered in the previous - // legalization pass. - tf2xla.addPass(mlir::mhlo::createLegalizeTfWithTf2XlaPass(device_type)); - tf2xla.addNestedPass(mlir::createCanonicalizerPass()); - // Run shape inference pass to propagate shapes through tensor_cast operations // from static to dynamic shapes. This could be generated if the shape // inference was originally missing in a TF op but the corresponding HLO op // had static shape after lowering. tf2xla.addPass(mlir::TF::CreateTFShapeInferencePass()); - // Run LegalizeTFPass again because the previous legalization passes can // expose more graph pruning and canonicalization opportunities that are // necessary for the second LegalizeTFPass(allow_partial_conversion=false) // invocation. - tf2xla.addNestedPass(mlir::mhlo::createLegalizeTFPass(false)); + tf2xla.addNestedPass(mlir::mhlo::createLegalizeTFPass( + /*allow_partial_conversion=*/false, /*legalize_chlo=*/true, + /*tf2xla_fallback_device_type=*/device_type)); // In order to export to XLA, we must sink constants to control flow regions, // since XLA uses functional control flow. 
tf2xla.addNestedPass( @@ -339,10 +366,10 @@ Status ConvertMLIRToXlaComputation( } static Status CompileMlirToXlaHlo( - mlir::ModuleOp module_op, llvm::ArrayRef arg_shapes, + mlir::ModuleOp module_op, llvm::ArrayRef arg_shapes, llvm::StringRef device_type, bool use_tuple_args, - XlaCompiler::ShapeRepresentationFn shape_representation_fn, - XlaCompiler::CompilationResult* compilation_result, + XlaHelpers::ShapeRepresentationFn shape_representation_fn, + XlaCompilationResult* compilation_result, std::vector> custom_legalization_passes) { if (VLOG_IS_ON(1)) tensorflow::DumpMlirOpToFile("mlir_compile_before", module_op); @@ -373,14 +400,10 @@ static Status CompileMlirToXlaHlo( shape_representation_fn, &compilation_result->xla_input_shapes)); - // Compute all output descriptions. - TF_RETURN_IF_ERROR(GetOutputInfo(module_op, shape_representation_fn, - &compilation_result->xla_output_shape, - &compilation_result->outputs)); - - // Compute what resource variables need to be updated after XlaComputation's - // execution. - GetResourceUpdatesForMlir(&compilation_result->resource_updates); + // Compute all output descriptions and resource writes + TF_RETURN_IF_ERROR(GetOutputInfo( + module_op, shape_representation_fn, &compilation_result->xla_output_shape, + &compilation_result->outputs, &compilation_result->resource_updates)); if (VLOG_IS_ON(1)) tensorflow::DumpMlirOpToFile("mlir_compile_after", module_op); @@ -391,8 +414,8 @@ static Status CompileMlirToXlaHlo( Status CompileSerializedMlirToXlaHlo( llvm::StringRef mlir_module_string, llvm::ArrayRef arg_shapes, llvm::StringRef device_type, bool use_tuple_args, - const XlaCompiler::ShapeRepresentationFn shape_representation_fn, - XlaCompiler::CompilationResult* compilation_result, + const XlaHelpers::ShapeRepresentationFn shape_representation_fn, + XlaCompilationResult* compilation_result, std::vector> custom_legalization_passes) { RegisterDialects(); mlir::MLIRContext mlir_context; @@ -400,27 +423,52 @@ Status CompileSerializedMlirToXlaHlo( TF_RETURN_IF_ERROR( ParseMlirModule(mlir_module_string, &mlir_context, &mlir_module)); - return CompileMlirToXlaHlo(mlir_module.get(), arg_shapes, device_type, - use_tuple_args, shape_representation_fn, - compilation_result, + llvm::SmallVector tensor_or_resource_shapes; + tensor_or_resource_shapes.reserve(arg_shapes.size()); + for (const auto& arg_shape : arg_shapes) + tensor_or_resource_shapes.push_back({arg_shape}); + return CompileMlirToXlaHlo(mlir_module.get(), tensor_or_resource_shapes, + device_type, use_tuple_args, + shape_representation_fn, compilation_result, std::move(custom_legalization_passes)); } // Rewrites the given module with specified args. For each of the constant args, // it gets inlined in the "main' function and the corresponding argument is -// removed from the signature. +// removed from the signature. For resource args, their subtypes are populated. // Returns the original indices for the other arguments on success. 
static StatusOr> RewriteWithArgs( - mlir::ModuleOp module, llvm::ArrayRef args) { + mlir::ModuleOp module, llvm::ArrayRef args) { mlir::FuncOp main_fn = module.lookupSymbol("main"); std::vector params; + bool has_resource_args = false; auto builder = mlir::OpBuilder(main_fn.getBody()); std::vector args_to_erase; for (int idx = 0; idx < args.size(); idx++) { - const XlaCompiler::Argument& xla_arg = args[idx]; + const XlaArgument& xla_arg = args[idx]; mlir::BlockArgument mlir_arg = main_fn.getArgument(idx); - if (xla_arg.kind != XlaCompiler::Argument::kConstant) { + if (xla_arg.kind == XlaArgument::kResource) { + mlir::Type element_type; + TF_RETURN_IF_ERROR(ConvertDataType(xla_arg.type, builder, &element_type)); + auto resource_shape = absl::get(xla_arg.shape).dim_sizes(); + llvm::SmallVector resource_subtype_shape( + resource_shape.begin(), resource_shape.end()); + auto resource_subtype = + mlir::RankedTensorType::get(resource_subtype_shape, element_type); + auto resource_type = + mlir::TF::ResourceType::get({resource_subtype}, builder.getContext()); + + auto tensor_type = mlir_arg.getType().cast(); + if (tensor_type.hasRank()) { + mlir_arg.setType( + mlir::RankedTensorType::get(tensor_type.getShape(), resource_type)); + } else { + mlir_arg.setType(mlir::UnrankedTensorType::get(resource_type)); + } + has_resource_args = true; + } + if (xla_arg.kind != XlaArgument::kConstant) { params.push_back(idx); continue; } @@ -434,22 +482,40 @@ static StatusOr> RewriteWithArgs( args_to_erase.push_back(idx); } + if (has_resource_args) { + llvm::SmallVector updated_argument_types; + updated_argument_types.reserve(main_fn.getNumArguments()); + for (mlir::BlockArgument& arg : main_fn.getArguments()) + updated_argument_types.push_back(arg.getType()); + + main_fn.setType(mlir::FunctionType::get(updated_argument_types, + main_fn.getType().getResults(), + main_fn.getContext())); + } + for (int idx : llvm::reverse(args_to_erase)) main_fn.eraseArgument(idx); + return params; } Status CompileGraphToXlaHlo( - const Graph& graph, llvm::ArrayRef args, + const Graph& graph, llvm::ArrayRef args, llvm::StringRef device_type, bool use_tuple_args, const FunctionLibraryDefinition& flib_def, const GraphDebugInfo& debug_info, - const XlaCompiler::ShapeRepresentationFn shape_representation_fn, - XlaCompiler::CompilationResult* compilation_result, + const XlaHelpers::ShapeRepresentationFn shape_representation_fn, + XlaCompilationResult* compilation_result, std::vector> custom_legalization_passes) { RegisterDialects(); mlir::MLIRContext context; GraphImportConfig config; config.graph_as_function = true; + // Disable shape inference during import as some TensorFlow op fails during + // shape inference with dynamic shaped operands. This in turn causes the + // import to fail. Shape inference during import is going to be removed and + // the shape inference pass is run early in the pass pipeline, shape inference + // during import is not necessary. 
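
Several template arguments in the `RewriteWithArgs` hunk above are likewise not visible. A hedged reconstruction of the new resource-argument branch, assuming `TensorShape` for the `absl::get`, `int64_t` elements for the subtype shape vector, and `mlir::TensorType` for the cast (all inferred from the surrounding code rather than spelled out in this diff):

    if (xla_arg.kind == XlaArgument::kResource) {
      mlir::Type element_type;
      TF_RETURN_IF_ERROR(ConvertDataType(xla_arg.type, builder, &element_type));
      // The shape of the value held by the resource becomes the subtype shape.
      auto resource_shape = absl::get<TensorShape>(xla_arg.shape).dim_sizes();
      llvm::SmallVector<int64_t, 4> resource_subtype_shape(
          resource_shape.begin(), resource_shape.end());
      auto resource_subtype =
          mlir::RankedTensorType::get(resource_subtype_shape, element_type);
      auto resource_type =
          mlir::TF::ResourceType::get({resource_subtype}, builder.getContext());

      auto tensor_type = mlir_arg.getType().cast<mlir::TensorType>();
      if (tensor_type.hasRank()) {
        mlir_arg.setType(
            mlir::RankedTensorType::get(tensor_type.getShape(), resource_type));
      } else {
        mlir_arg.setType(mlir::UnrankedTensorType::get(resource_type));
      }
      has_resource_args = true;
    }

Populating the resource subtype gives downstream passes (for example shape inference) the tensor type held by each resource argument.
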
+ config.enable_shape_inference = false; auto module_or = ConvertGraphToMlir(graph, debug_info, flib_def, config, &context); if (!module_or.ok()) return module_or.status(); @@ -457,10 +523,21 @@ Status CompileGraphToXlaHlo( mlir::ModuleOp module = module_or.ValueOrDie().get(); TF_ASSIGN_OR_RETURN(std::vector remaining_params, RewriteWithArgs(module, {args.data(), args.size()})); - llvm::SmallVector arg_shapes; - arg_shapes.reserve(args.size()); - for (unsigned idx : remaining_params) - arg_shapes.push_back(absl::get(args[idx].shape)); + llvm::SmallVector arg_shapes; + arg_shapes.reserve(remaining_params.size()); + for (unsigned idx : remaining_params) { + const auto& arg = args[idx]; + arg_shapes.push_back({absl::get(arg.shape), + /*is_resource=*/arg.kind == XlaArgument::kResource}); + } + + mlir::PassManager pm(&context); + mlir::TF::StandardPipelineOptions tf_options; + mlir::TF::CreateTFStandardPipeline(pm, tf_options); + { + mlir::StatusScopedDiagnosticHandler diag_handler(module.getContext()); + if (failed(pm.run(module))) return diag_handler.ConsumeStatus(); + } auto status = CompileMlirToXlaHlo( module, arg_shapes, device_type, use_tuple_args, shape_representation_fn, diff --git a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h index 24b60dcb346..5c64a65ecbd 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h @@ -20,7 +20,10 @@ limitations under the License. #include "llvm/ADT/StringRef.h" #include "mlir/IR/Module.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project -#include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "tensorflow/compiler/tf2xla/xla_argument.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/protobuf/graph_debug_info.pb.h" #include "tensorflow/stream_executor/lib/statusor.h" @@ -57,7 +60,7 @@ Status ConvertMLIRToXlaComputation( mlir::ModuleOp module_op, llvm::StringRef device_type, xla::XlaComputation* xla_computation, bool use_tuple_args, bool return_tuple, - const XlaCompiler::ShapeRepresentationFn shape_representation_fn = nullptr, + const XlaHelpers::ShapeRepresentationFn shape_representation_fn = nullptr, std::vector> custom_legalization_passes = {}); // Compiles a serialized MLIR module into XLA HLO, generates all accompanying @@ -65,17 +68,18 @@ Status ConvertMLIRToXlaComputation( Status CompileSerializedMlirToXlaHlo( llvm::StringRef mlir_module_string, llvm::ArrayRef arg_shapes, llvm::StringRef device_type, bool use_tuple_args, - const XlaCompiler::ShapeRepresentationFn shape_representation_fn, - XlaCompiler::CompilationResult* compilation_result, + const XlaHelpers::ShapeRepresentationFn shape_representation_fn, + XlaCompilationResult* compilation_result, std::vector> custom_legalization_passes = {}); // Same as the above but takes input as TensorFlow Graph. +// TODO(lyandy): Allow populating of targets/control outputs. 
Status CompileGraphToXlaHlo( - const Graph& graph, llvm::ArrayRef args, + const Graph& graph, llvm::ArrayRef args, llvm::StringRef device_type, bool use_tuple_args, const FunctionLibraryDefinition& flib_def, const GraphDebugInfo& debug_info, - const XlaCompiler::ShapeRepresentationFn shape_representation_fn, - XlaCompiler::CompilationResult* compilation_result, + const XlaHelpers::ShapeRepresentationFn shape_representation_fn, + XlaCompilationResult* compilation_result, std::vector> custom_legalization_passes = {}); } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util_test.cc index dde2408c83a..8a07aab11e1 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util_test.cc @@ -15,6 +15,9 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h" +#include "tensorflow/cc/framework/scope.h" +#include "tensorflow/cc/ops/function_ops.h" +#include "tensorflow/cc/ops/resource_variable_ops.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/xla/service/hlo_module.h" @@ -448,9 +451,6 @@ TEST(CompileGraphToXlaHlo, Basic) { FunctionLibraryDefinition flib_def(OpRegistry::Global(), {}); Graph graph(OpRegistry::Global()); - Tensor dummy_tensor(DT_FLOAT, TensorShape({1})); - test::FillValues(&dummy_tensor, {-1.0}); - Node* arg = test::graph::Arg(&graph, 0, DT_FLOAT); test::graph::Retval(&graph, 0, arg); @@ -483,5 +483,60 @@ ENTRY %main.3 (Arg_0.1: f32[]) -> (f32[]) { status_or_hlo_module.ValueOrDie()->ToString()); } +// Tests a conversion from Graph to MLIR with resource arguments. 
+TEST(CompileGraphToXlaHlo, Resources) { + FunctionLibraryDefinition flib_def(OpRegistry::Global(), {}); + Graph graph(OpRegistry::Global()); + + Scope scope = Scope::NewRootScope().ExitOnError(); + auto val = ops::_Arg(scope.WithOpName("arg0"), DT_FLOAT, 0); + auto var = ops::_Arg(scope.WithOpName("arg1"), DT_RESOURCE, 1); + auto assign = + ops::AssignVariableOp(scope.WithOpName("assign_variable"), var, val); + TF_ASSERT_OK(scope.ToGraph(&graph)); + + XlaCompiler::CompilationResult result; + XlaCompiler::Argument arg0; + arg0.kind = XlaCompiler::Argument::kParameter; + arg0.shape = TensorShape({2}); + XlaCompiler::Argument arg1; + arg1.kind = XlaCompiler::Argument::kResource; + arg1.shape = TensorShape({2}); + arg1.type = DT_FLOAT; + + TF_ASSERT_OK( + CompileGraphToXlaHlo(graph, /*args=*/{arg0, arg1}, "XLA_CPU_JIT", + /*use_tuple_args=*/false, flib_def, GraphDebugInfo(), + /*shape_representation_fn=*/nullptr, &result)); + + EXPECT_EQ(result.outputs.size(), 0); + ASSERT_EQ(result.resource_updates.size(), 1); + const auto& resource_update = result.resource_updates[0]; + EXPECT_EQ(resource_update.input_index, 1); + EXPECT_EQ(resource_update.modified, true); + EXPECT_EQ(resource_update.shape, TensorShape({2})); + EXPECT_EQ(resource_update.type, DT_FLOAT); + + const xla::HloModuleConfig module_config( + result.computation->GetProgramShape().ValueOrDie()); + auto status_or_hlo_module = xla::HloModule::CreateFromProto( + result.computation->proto(), module_config); + ASSERT_TRUE(status_or_hlo_module.ok()); + + constexpr char expected_hlo_module_string[] = + R"(HloModule main.4, input_output_alias={ {0}: (1, {}, may_alias) } + +ENTRY %main.4 (Arg_0.1: f32[2], Arg_1.2: f32[2]) -> (f32[2]) { + %Arg_1.2 = f32[2]{0} parameter(1) + %Arg_0.1 = f32[2]{0} parameter(0) + ROOT %tuple.3 = (f32[2]{0}) tuple(f32[2]{0} %Arg_0.1) +} + +)"; + + EXPECT_EQ(expected_hlo_module_string, + status_or_hlo_module.ValueOrDie()->ToString()); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/device_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/device_util.cc index bf0b3b75ace..81892934efe 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/device_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/device_util.cc @@ -17,6 +17,7 @@ limitations under the License. #include +#include "absl/strings/string_view.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" @@ -25,6 +26,8 @@ limitations under the License. 
#include "llvm/Support/Regex.h" #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/core/common_runtime/device.h" @@ -155,4 +158,19 @@ mlir::LogicalResult GetDevicesFromOp(mlir::Operation* op, llvm::formatv("unsupported '{0}' attribute", kDevicesAttr)); } +mlir::LogicalResult GetDeviceOrdinalFromDeviceString(mlir::Location loc, + llvm::StringRef device, + int64_t* device_ordinal) { + DeviceNameUtils::ParsedName parsed_name; + if (!DeviceNameUtils::ParseFullName( + absl::string_view(device.data(), device.size()), &parsed_name)) + return mlir::emitError(loc) << "invalid device '" << device << "'"; + + if (!parsed_name.has_id) + return mlir::emitError(loc) << "device '" << device << "' has no id"; + + *device_ordinal = parsed_name.id; + return mlir::success(); +} + } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/device_util.h b/tensorflow/compiler/mlir/tensorflow/utils/device_util.h index 893e118024c..14e48bf7710 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/device_util.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/device_util.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_DEVICE_UTIL_H_ #include "llvm/ADT/SmallVector.h" +#include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" @@ -41,6 +42,12 @@ void AddDevicesToOp(mlir::Operation* op, const DeviceSet* device_set); mlir::LogicalResult GetDevicesFromOp(mlir::Operation* op, mlir::TF::RuntimeDevices* devices); +// Parses a device string and returns its ordinal (id). This will return an +// error if the device string is invalid or has no id. 
+mlir::LogicalResult GetDeviceOrdinalFromDeviceString(mlir::Location loc, + llvm::StringRef device, + int64_t* device_ordinal); + } // namespace tensorflow #endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_DEVICE_UTIL_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/utils/device_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/device_util_test.cc index bc849e1d116..1da1f5973f6 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/device_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/device_util_test.cc @@ -205,5 +205,47 @@ TEST(DeviceUtilTest, GetGpuDeviceMetadata) { ASSERT_FALSE(meta_1.hasValue()); } +TEST(DeviceUtilTest, GetDeviceOrdinalFromDeviceString) { + const std::string tpu0 = "/job:worker/replica:0/task:0/device:TPU:0"; + const std::string tpu1 = "/job:worker/replica:0/task:0/device:TPU:1"; + + mlir::MLIRContext context; + auto unknown_loc = mlir::UnknownLoc::get(&context); + + int64_t device_ordinal0 = -1; + mlir::LogicalResult result0 = + GetDeviceOrdinalFromDeviceString(unknown_loc, tpu0, &device_ordinal0); + EXPECT_TRUE(mlir::succeeded(result0)); + EXPECT_EQ(device_ordinal0, 0); + + int64_t device_ordinal1 = -1; + mlir::LogicalResult result1 = + GetDeviceOrdinalFromDeviceString(unknown_loc, tpu1, &device_ordinal1); + EXPECT_TRUE(mlir::succeeded(result1)); + EXPECT_EQ(device_ordinal1, 1); +} + +TEST(DeviceUtilTest, GetDeviceOrdinalFromDeviceStringInvalid) { + mlir::MLIRContext context; + auto unknown_loc = mlir::UnknownLoc::get(&context); + + int64_t device_ordinal = -1; + mlir::LogicalResult result = GetDeviceOrdinalFromDeviceString( + unknown_loc, "bad_device", &device_ordinal); + EXPECT_TRUE(mlir::failed(result)); +} + +TEST(DeviceUtilTest, GetDeviceOrdinalFromDeviceStringNoId) { + const std::string tpu_no_id = "/job:worker/replica:0/task:0/device:TPU"; + + mlir::MLIRContext context; + auto unknown_loc = mlir::UnknownLoc::get(&context); + + int64_t device_ordinal = -1; + mlir::LogicalResult result = + GetDeviceOrdinalFromDeviceString(unknown_loc, tpu_no_id, &device_ordinal); + EXPECT_TRUE(mlir::failed(result)); +} + } // anonymous namespace } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/error_util.h b/tensorflow/compiler/mlir/tensorflow/utils/error_util.h index 4feb3837357..b5f2acc581d 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/error_util.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/error_util.h @@ -27,7 +27,7 @@ limitations under the License. namespace mlir { // TensorFlow's Status is used for error reporting back to callers. -using tensorflow::Status; +using ::tensorflow::Status; // Diagnostic handler that collects all the diagnostics reported and can produce // a Status to return to callers. This is for the case where MLIR functions are diff --git a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc index 852bc72d7de..ad9ddb277d7 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc @@ -82,7 +82,7 @@ Status ConvertLocation(mlir::Location inst_loc, if (locations.size() <= 1) return errors::InvalidArgument("expected experimental debuf info."); // skip the first one, which is the name of the node_def. 
- for (int i = 0; i < locations.size() - 1; ++i) { + for (int i = 0, end = locations.size() - 1; i < end; ++i) { TF_RETURN_IF_ERROR(ConvertLocation(locations[i], debug_info)); } } @@ -121,6 +121,20 @@ Status ConvertAttribute(const mlir::TF::ShapeAttr& attr, AttrValue* value) { return Status::OK(); } +Status ConvertAttribute(const mlir::FlatSymbolRefAttr& attr, AttrValue* value) { + value->mutable_func()->set_name(attr.getValue().str()); + return Status::OK(); +} + +Status ConvertAttribute(const mlir::TF::FuncAttr& attr, AttrValue* value) { + TF_RETURN_IF_ERROR( + ConvertAttribute(attr.GetName().cast(), value)); + TF_RETURN_IF_ERROR(ConvertAttributes(attr.GetAttrs().getValue(), + /*attrs_to_ignore=*/{}, + value->mutable_func()->mutable_attr())); + return Status::OK(); +} + Status ConvertAttribute(const mlir::StringAttr& attr, AttrValue* value) { absl::string_view attr_value(attr.getValue().data(), attr.getValue().size()); switch (mangling_util::GetMangledKind(attr_value)) { @@ -160,11 +174,6 @@ Status ConvertAttribute(const mlir::UnitAttr& attr, AttrValue* value) { return Status::OK(); } -Status ConvertAttribute(const mlir::FlatSymbolRefAttr& attr, AttrValue* value) { - value->mutable_func()->set_name(std::string(attr.getValue())); - return Status::OK(); -} - Status ConvertAttribute(const mlir::ArrayAttr& attr, AttrValue* value) { auto* list = value->mutable_list(); for (mlir::Attribute a : attr.getValue()) { @@ -218,25 +227,13 @@ Status ConvertAttribute(const mlir::ArrayAttr& attr, AttrValue* value) { return Status::OK(); } -// Updates NodeDef constructed out of an MLIR If op to map it to either -// TensorFlow StatelessIf or If op depending on the additional attribute. -void UpdateCompositeIfOp(NodeDef* node_def) { +// Updates NodeDef constructed out of an MLIR Case/IfW/While op to map it to +// either TensorFlow StatelessX or X op depending on the additional attribute. +void UpdateCompositeOp(NodeDef* node_def) { auto it = node_def->mutable_attr()->find("is_stateless"); if (it != node_def->attr().end()) { if (it->second.b()) { - *node_def->mutable_op() = "StatelessIf"; - } - node_def->mutable_attr()->erase(it); - } -} - -// Updates NodeDef constructed out of an MLIR While op to map it to either -// TensorFlow StatelessWhile or While op depending on the additional attribute. 
-void UpdateCompositeWhileOp(NodeDef* node_def) { - auto it = node_def->mutable_attr()->find("is_stateless"); - if (it != node_def->attr().end()) { - if (it->second.b()) { - *node_def->mutable_op() = "StatelessWhile"; + *node_def->mutable_op() = "Stateless" + node_def->op(); } node_def->mutable_attr()->erase(it); } @@ -343,8 +340,9 @@ StatusOr> GetOperationNodeDef( TF_RETURN_IF_ERROR(ConvertLocation( inst->getLoc(), node_def->mutable_experimental_debug_info())); - if (node_def->op() == "If") UpdateCompositeIfOp(node_def.get()); - if (node_def->op() == "While") UpdateCompositeWhileOp(node_def.get()); + if (node_def->op() == "Case") UpdateCompositeOp(node_def.get()); + if (node_def->op() == "If") UpdateCompositeOp(node_def.get()); + if (node_def->op() == "While") UpdateCompositeOp(node_def.get()); return node_def; } @@ -372,8 +370,8 @@ Status ConvertAttributes( AttrValue value; switch (attr.getKind()) { case mlir::StandardAttributes::SymbolRef: { - auto func_attr = attr.cast(); - value.mutable_func()->set_name(std::string(func_attr.getValue())); + TF_RETURN_IF_ERROR( + ConvertAttribute(attr.cast(), &value)); func_call_attrs[string(name)] = value; continue; } @@ -415,6 +413,12 @@ Status ConvertAttributes( TF_RETURN_IF_ERROR( ConvertAttribute(attr.cast(), &value)); break; + case static_cast(mlir::TF::AttrKind::FUNC): { + TF_RETURN_IF_ERROR( + ConvertAttribute(attr.cast(), &value)); + func_call_attrs[string(name)] = value; + continue; + } // AffineMap kind is not implemented. case mlir::StandardAttributes::AffineMap: return errors::Unimplemented("AffineMap attribute (needed for '", @@ -503,7 +507,7 @@ Status SetSizeAttribute(absl::string_view name, size_t size, // This should be extremely rare as it means we are adding the same // attribute multiple times/have some redundancy in representing this // attribute. - int64 actual_size = result.first->second.i(); + size_t actual_size = result.first->second.i(); // Just check via string output as we shouldn't get here and if we do they // should be trivially the same, else fail. if (actual_size != size) diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc index f884b75bce1..843d491c330 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc @@ -149,7 +149,8 @@ Status GetTPUDevices( std::next(system_devices.begin()), system_devices.end())) { auto host_tpu_devices = lookup(device_spec); // Check number of TPU devices per host all match. 
- if (num_tpus_per_host != host_tpu_devices.size()) + const int64 host_tpu_devices_size = host_tpu_devices.size(); + if (num_tpus_per_host != host_tpu_devices_size) return errors::InvalidArgument( "expected the number of TPU devices per host to be ", num_tpus_per_host, ", got ", host_tpu_devices.size()); @@ -354,7 +355,8 @@ GetGeneralTPUExecutionDeviceAssignment( const int expected_device_assignment_size = num_replicas * num_cores_per_replica * kTPUTopologyRank; - if (device_assignment_attr.size() != expected_device_assignment_size) + const int device_assignment_attr_size = device_assignment_attr.size(); + if (device_assignment_attr_size != expected_device_assignment_size) return errors::InvalidArgument( "length of '", kDeviceAssignmentAttr, "' must be 'num_replicas' * 'num_cores_per_replica' * ", diff --git a/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc index 083a5abf840..a3f8e833ae3 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc @@ -242,7 +242,8 @@ mlir::LogicalResult ExtractInputsForLogicalDevices( cluster_func.getLoc(), sharding, input_value, builder, &tiled_inputs); if (mlir::failed(result)) return mlir::failure(); - if (tiled_inputs.size() != num_cores_per_replica) + const int64 tiled_inputs_size = tiled_inputs.size(); + if (tiled_inputs_size != num_cores_per_replica) cluster_func.emitError(llvm::formatv( "incorrect {0}-th tiled input sharding received. " "Product of tile sharding splits({1}) must be equal to " @@ -376,7 +377,8 @@ mlir::LogicalResult HandleTileShardedOutputs( llvm::SmallVector new_outputs; new_outputs.reserve(num_splits); - for (int i = 0; i < outputs_to_merge.size(); i = i + num_splits) { + for (int i = 0, end = outputs_to_merge.size(); i < end; + i = i + num_splits) { mlir::TF::ConcatOp concat_op; auto result = CreateConcatOp(concat_dimension, location, diff --git a/tensorflow/compiler/mlir/tf_mlir_translate_main.cc b/tensorflow/compiler/mlir/tf_mlir_translate_main.cc index 8cfdfd01120..caac8ea1eeb 100644 --- a/tensorflow/compiler/mlir/tf_mlir_translate_main.cc +++ b/tensorflow/compiler/mlir/tf_mlir_translate_main.cc @@ -121,7 +121,7 @@ int main(int argc, char** argv) { mlir::MLIRContext context; auto module_or = tensorflow::SavedModelSignatureDefsToMlirImport( - input_filename, tags, exported_names, &context); + input_filename, tags, exported_names, &context, upgrade_legacy); if (!module_or.status().ok()) return 1; module_or.ConsumeValueOrDie()->print(output->os()); diff --git a/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.cc b/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.cc index 9ba875cdce4..331bed09dce 100644 --- a/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.cc +++ b/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.cc @@ -25,8 +25,7 @@ namespace tfjs { // TFJSDialect //===----------------------------------------------------------------------===// -TFJSDialect::TFJSDialect(MLIRContext *context) - : Dialect(getDialectNamespace(), context) { +void TFJSDialect::initialize() { addOperations< #define GET_OP_LIST #include "tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.cc.inc" diff --git a/tensorflow/compiler/mlir/tfjs/tf_tfjs_passes.cc b/tensorflow/compiler/mlir/tfjs/tf_tfjs_passes.cc index b7e95629062..4e24007a8c6 100644 --- a/tensorflow/compiler/mlir/tfjs/tf_tfjs_passes.cc +++ b/tensorflow/compiler/mlir/tfjs/tf_tfjs_passes.cc @@ -45,7 +45,7 @@ void AddTFToTFJSConversionPasses(mlir::OpPassManager* pm) { // 
raise to executor dialect in order to use GraphDef converter pm->addNestedPass( mlir::CreateFunctionalToExecutorDialectConversionPass()); - pm->addNestedPass(mlir::CreateBreakUpIslandsPass()); + pm->addPass(mlir::CreateBreakUpIslandsPass()); } } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD index b5735f823e4..5befdcdc513 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD +++ b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD @@ -1,7 +1,16 @@ load("//tensorflow:tensorflow.bzl", "tf_cc_binary") load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") -licenses(["notice"]) +package( + default_visibility = [":friends"], + licenses = ["notice"], # Apache 2.0 +) + +package_group( + name = "friends", + includes = ["//third_party/mlir:subpackages"], + packages = ["//tensorflow/compiler/mlir/..."], +) cc_library( name = "cubin_creator", @@ -50,3 +59,33 @@ tf_cc_binary( "@llvm-project//llvm:Support", ], ) + +tf_cc_binary( + name = "kernel-gen-opt", + srcs = ["tools/kernel-gen-opt/kernel-gen-opt.cc"], + visibility = ["//tensorflow/compiler/mlir/tools/kernel_gen/tests:__pkg__"], + deps = [ + "//tensorflow/compiler/mlir/hlo:hlo_dialect_registration", + "//tensorflow/compiler/mlir/tools/kernel_gen/ir:tf_framework_dialect_registration", + "//tensorflow/compiler/mlir/tools/kernel_gen/transforms:passes", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:MlirOptLib", + "@llvm-project//mlir:MlirOptMain", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + ], +) + +exports_files(["tf_framework_c_interface.h"]) + +cc_library( + name = "tf_framework_c_interface", + srcs = ["tf_framework_c_interface.cc"], + hdrs = ["tf_framework_c_interface.h"], + deps = [ + "//tensorflow/core:framework", + "@llvm-project//mlir:mlir_runner_utils", + ], +) diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc b/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc index 1f511e27d9e..82b0e613f90 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc @@ -278,7 +278,8 @@ StatusOr> tensorflow::kernel_gen::GenerateCubinForTfCode( mlir::OwningModuleRef kernel_module = xla::mlir_gpu::ExtractKernelModule(*module).ValueOrDie(); - auto llvmModule = mlir::translateModuleToNVVMIR(*kernel_module); + llvm::LLVMContext llvmContext; + auto llvmModule = mlir::translateModuleToNVVMIR(*kernel_module, llvmContext); if (!llvmModule) { return InternalError("Could not translate MLIR module to NVVM"); } diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/ir/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/ir/BUILD new file mode 100644 index 00000000000..3a28d4815d2 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/ir/BUILD @@ -0,0 +1,47 @@ +load("//third_party/mlir:tblgen.bzl", "gentbl") + +package( + default_visibility = ["//tensorflow/compiler/mlir/tools/kernel_gen:friends"], + licenses = ["notice"], # Apache 2.0 +) + +gentbl( + name = "tf_framework_ops_inc_gen", + tbl_outs = [ + ("-gen-op-decls", "tf_framework_ops.h.inc"), + ("-gen-op-defs", "tf_framework_ops.cc.inc"), + ("-gen-dialect-decls", "tf_framework_dialect.h.inc"), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "tf_framework_ops.td", + td_srcs = [ + "tf_framework_ops.td", + "@llvm-project//mlir:OpBaseTdFiles", + 
"@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", + ], +) + +cc_library( + name = "tf_framework_ops", + srcs = [ + "tf_framework_ops.cc", + "tf_framework_ops.cc.inc", + "tf_framework_ops.h.inc", + ], + hdrs = ["tf_framework_ops.h"], + deps = [ + ":tf_framework_ops_inc_gen", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:SideEffects", + ], +) + +cc_library( + name = "tf_framework_dialect_registration", + srcs = ["dialect_registration.cc"], + deps = [ + ":tf_framework_ops", + "@llvm-project//mlir:IR", + ], + alwayslink = 1, +) diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/ir/dialect_registration.cc b/tensorflow/compiler/mlir/tools/kernel_gen/ir/dialect_registration.cc new file mode 100644 index 00000000000..a2e5955b570 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/ir/dialect_registration.cc @@ -0,0 +1,21 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h" + +// Static initialization for TF Framework dialect registration. +static mlir::DialectRegistration< + mlir::kernel_gen::tf_framework::TFFrameworkDialect> + tf_framework_ops; diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.cc b/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.cc new file mode 100644 index 00000000000..5b7a19a3eac --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.cc @@ -0,0 +1,87 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file defines the operations used in the tf_framework dialect. + +#include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h" + +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/DialectImplementation.h" // from @llvm-project + +namespace mlir { +namespace kernel_gen { +namespace tf_framework { + +void TFFrameworkDialect::initialize() { + addOperations< +#define GET_OP_LIST +#include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.cc.inc" + >(); + addTypes(); +} + +/// Parse a type registered to this dialect. 
+Type TFFrameworkDialect::parseType(DialectAsmParser &parser) const { + StringRef keyword; + if (parser.parseKeyword(&keyword)) return Type(); + + if (keyword == "op_kernel_context") { + return OpKernelContextType::get(getContext()); + } + + parser.emitError(parser.getNameLoc(), "unknown TF Framework type: ") + << keyword; + return Type(); +} + +/// Print a type registered to this dialect. +void TFFrameworkDialect::printType(Type type, DialectAsmPrinter &os) const { + switch (type.getKind()) { + case TFFrameworkTypes::OpKernelContextType: + os << "op_kernel_context"; + return; + default: + llvm_unreachable("unexpected TF Framework type kind"); + } +} + +template +LogicalResult Verify(OpTy op) { + return success(); +} + +//===----------------------------------------------------------------------===// +// AllocRawOp +//===----------------------------------------------------------------------===// +template <> +LogicalResult Verify(AllocRawOp op) { + // Check that the total number of operands matches the number of dynamic + // dimensions specified in the memref type. + unsigned result_dyn_dims = op.getType().getNumDynamicDims(); + unsigned dyn_sizes_count = op.dyn_sizes().size(); + if (dyn_sizes_count != result_dyn_dims) + return op.emitOpError() + << "`dyn_sizes` count " << dyn_sizes_count + << " does not match dynamic dimensions count in the result type" + << op.getType(); + return success(); +} + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.cc.inc" + +} // namespace tf_framework +} // namespace kernel_gen +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h b/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h new file mode 100644 index 00000000000..8d6e433d9b9 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h @@ -0,0 +1,64 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file defines the operations used in the TFFramework dialect. 
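Given the parseType/printType hooks above, the context type round-trips in textual IR as `!tf_framework.op_kernel_context`. An entry function that threads the TF context through as its first argument might look like the following sketch (function name and extra argument are illustrative):

    func @compute(%ctx: !tf_framework.op_kernel_context, %size: index) {
      return
    }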
+// +#ifndef TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_IR_TF_FRAMEWORK_OPS_H_ +#define TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_IR_TF_FRAMEWORK_OPS_H_ + +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/OpImplementation.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project + +namespace mlir { +namespace kernel_gen { +namespace tf_framework { + +namespace TFFrameworkTypes { +enum Kind { + OpKernelContextType = Type::FIRST_TF_FRAMEWORK_TYPE, +}; +} // namespace TFFrameworkTypes + +/// OpKernelContextType corresponds to C++ class OpKernelContext defined in +/// tensorflow/core/framework/op_kernel.h +class OpKernelContextType + : public Type::TypeBase { + public: + using Base::Base; + + static OpKernelContextType get(MLIRContext *context) { + return Base::get(context, TFFrameworkTypes::Kind::OpKernelContextType); + } + + /// Support method to enable LLVM-style type casting. + static bool kindof(unsigned kind) { + return kind == TFFrameworkTypes::Kind::OpKernelContextType; + } +}; + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_dialect.h.inc" +#include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h.inc" + +} // namespace tf_framework +} // namespace kernel_gen +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_IR_TF_FRAMEWORK_OPS_H_ diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.td b/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.td new file mode 100644 index 00000000000..bc390a5aaa5 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.td @@ -0,0 +1,125 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This is the operation definition file for TF Framework ops. + +#ifndef TF_FRAMEWORK_OPS +#define TF_FRAMEWORK_OPS + +include "mlir/IR/OpBase.td" +include "mlir/Interfaces/SideEffectInterfaces.td" + +def TFFramework_Dialect : Dialect { + let name = "tf_framework"; + + let summary = "Types and operations for tf_framework dialect"; + let description = [{ + This dialect contains operations and types for that correspond to + TensorFlow C++ Framework. + }]; + let cppNamespace = "kernel_gen::tf_framework"; +} + +def TFFramework_OpKernelContextType : DialectType()">, + "op_kernel_construction">, + BuildableType<"$_builder.getType<::mlir::kernel_gen::tf_framework::OpKernelContextType>()"> { + let typeDescription = [{ + OpKernelContextType corresponds to C++ class OpKernelContext defined in + tensorflow/core/framework/op_kernel.h + }]; +} + +// Base class for TF Framework dialect ops. 
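Based on the assembly formats defined below, the three ops round-trip in textual IR roughly as in this sketch (the memref type and SSA names are illustrative):

    %ctx = tf_framework.null_context() : !tf_framework.op_kernel_context
    %buf = tf_framework.alloc_raw(%ctx, %size) : memref<?x10xf32>
    tf_framework.dealloc_raw(%ctx, %buf) : memref<?x10xf32>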
+class TFFramework_Op traits = []> : + Op { + let verifier = "return Verify<$cppClass>(*this);"; +} + +//===----------------------------------------------------------------------===// +// AllocRawOp +//===----------------------------------------------------------------------===// +def TFFramework_AllocRawOp : TFFramework_Op<"alloc_raw", + [MemoryEffects<[MemAlloc]>]> { + let summary = "allocation of tensors that uses TF Framework"; + let description = [{ + Allocation of tensors during kernel execution in the Compute method. + + This should be used to allocate any temporary or output memref. + Corresponds to `Allocator::AllocateRaw` in + tensorflow/core/framework/allocator.h. + }]; + + let arguments = (ins TFFramework_OpKernelContextType:$ctx, + Variadic:$dyn_sizes); + let results = (outs Res]>:$result); + + let builders = [ + OpBuilder<[{ + OpBuilder &builder, OperationState &result, MemRefType memref_type, + Value ctx + }], [{ + result.addOperands(ctx); + result.types.push_back(memref_type); + }]>, + + OpBuilder<[{ + OpBuilder &builder, OperationState &result, MemRefType memref_type, + Value ctx, ValueRange dyn_sizes + }], [{ + build(builder, result, memref_type, ctx); + result.addOperands(dyn_sizes); + }]>]; + + let extraClassDeclaration = [{ + MemRefType getType() { return getResult().getType().cast(); } + }]; + let assemblyFormat = [{ + `(` $ctx (`,` $dyn_sizes^ )? `)` attr-dict `:` type($result) + }]; +} + +//===----------------------------------------------------------------------===// +// DeallocRawOp +//===----------------------------------------------------------------------===// +def TFFramework_DeallocRawOp : TFFramework_Op<"dealloc_raw", + [MemoryEffects<[MemFree]>]> { + let summary = "deallocation of tensors that uses TF Framework"; + let description = [{ + Deallocation of tensors during kernel execution in the Compute method. + + This should be used to deallocate any temporary memref that was allocated + with `tf_framework.alloc_raw`. + Corresponds to `Allocator::DeallocateRaw` in + tensorflow/core/framework/allocator.h. + }]; + + let arguments = (ins TFFramework_OpKernelContextType:$ctx, + Arg:$memref); + let assemblyFormat = "`(` $ctx `,` $memref `)` attr-dict `:` type($memref)"; +} + +//===----------------------------------------------------------------------===// +// NullContextOp +//===----------------------------------------------------------------------===// +def TFFramework_NullContextOp : TFFramework_Op<"null_context", + [NoSideEffect]> { + let summary = "Creates a fake TF context that will be lowered to nullptr"; + let description = [{Needed for testing}]; + let results = (outs TFFramework_OpKernelContextType:$result); + let assemblyFormat = "`(` `)` attr-dict `:` type($result)"; +} + +#endif // TF_FRAMEWORK_OPS diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.cc b/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.cc new file mode 100644 index 00000000000..e75db59d885 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.cc @@ -0,0 +1,49 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.h" + +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/op_kernel.h" + +namespace mlir { +namespace kernel_gen { +namespace tf_framework { +namespace { + +using tensorflow::Allocator; + +Allocator* GetAllocator(void* op_kernel_ctx) { + auto* ctx = static_cast(op_kernel_ctx); + // TODO(pifon): Figure out how to set AllocatorAttributes correctly. + tensorflow::AllocatorAttributes attrs; + return ctx->get_allocator(attrs); +} + +} // namespace + +extern "C" void* _mlir_ciface_tf_alloc_raw(void* op_kernel_ctx, + size_t num_bytes) { + return GetAllocator(op_kernel_ctx) + ->AllocateRaw(Allocator::kAllocatorAlignment, num_bytes); +} + +extern "C" void _mlir_ciface_tf_dealloc_raw(void* op_kernel_ctx, void* ptr) { + GetAllocator(op_kernel_ctx)->DeallocateRaw(ptr); +} + +} // namespace tf_framework +} // namespace kernel_gen +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.h b/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.h new file mode 100644 index 00000000000..143ebc95932 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.h @@ -0,0 +1,35 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_TESTS_TF_FRAMEWORK_C_INTERFACE_H_ +#define TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_TESTS_TF_FRAMEWORK_C_INTERFACE_H_ + +#include "mlir/ExecutionEngine/RunnerUtils.h" // from @llvm-project + +namespace mlir { +namespace kernel_gen { +namespace tf_framework { + +extern "C" MLIR_RUNNERUTILS_EXPORT void* _mlir_ciface_tf_alloc_raw( + void* op_kernel_ctx, size_t num_bytes); + +extern "C" MLIR_RUNNERUTILS_EXPORT void _mlir_ciface_tf_dealloc_raw( + void* op_kernel_ctx, void* ptr); + +} // namespace tf_framework +} // namespace kernel_gen +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_TESTS_TF_FRAMEWORK_C_INTERFACE_H_ diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tools/kernel-gen-opt/kernel-gen-opt.cc b/tensorflow/compiler/mlir/tools/kernel_gen/tools/kernel-gen-opt/kernel-gen-opt.cc new file mode 100644 index 00000000000..c1af35617b1 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tools/kernel-gen-opt/kernel-gen-opt.cc @@ -0,0 +1,122 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/InitLLVM.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/ToolOutputFile.h" +#include "mlir/IR/AsmState.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/InitAllDialects.h" // from @llvm-project +#include "mlir/InitAllPasses.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/FileUtilities.h" // from @llvm-project +#include "mlir/Support/MlirOptMain.h" // from @llvm-project +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/register.h" +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h" + +// NOLINTNEXTLINE +static llvm::cl::opt inputFilename(llvm::cl::Positional, + llvm::cl::desc(""), + llvm::cl::init("-")); + +// NOLINTNEXTLINE +static llvm::cl::opt outputFilename( + "o", llvm::cl::desc("Output filename"), llvm::cl::value_desc("filename"), + llvm::cl::init("-")); + +// NOLINTNEXTLINE +static llvm::cl::opt splitInputFile( + "split-input-file", + llvm::cl::desc("Split the input file into pieces and process each " + "chunk independently"), + llvm::cl::init(false)); + +// NOLINTNEXTLINE +static llvm::cl::opt verifyDiagnostics( + "verify-diagnostics", + llvm::cl::desc("Check that emitted diagnostics match " + "expected-* lines on the corresponding line"), + llvm::cl::init(false)); + +// NOLINTNEXTLINE +static llvm::cl::opt verifyPasses( + "verify-each", + llvm::cl::desc("Run the verifier after each transformation pass"), + llvm::cl::init(true)); + +// NOLINTNEXTLINE +static llvm::cl::opt allowUnregisteredDialects( + "allow-unregistered-dialect", + llvm::cl::desc("Allow operation with no registered dialects"), + llvm::cl::init(false)); + +// NOLINTNEXTLINE +static llvm::cl::opt showDialects( + "show-dialects", llvm::cl::desc("Print the list of registered dialects"), + llvm::cl::init(false)); + +int main(int argc, char **argv) { + mlir::registerAllDialects(); + mlir::registerAllPasses(); + + mlir::mhlo::registerAllDialects(); + mlir::kernel_gen::registerKernelGenPasses(); + + llvm::InitLLVM y(argc, argv); + + // Register any pass manager command line options. + mlir::registerAsmPrinterCLOptions(); + mlir::registerPassManagerCLOptions(); + mlir::PassPipelineCLParser passPipeline("", "Compiler passes to run"); + + // Parse pass names in main to ensure static initialization completed. + llvm::cl::ParseCommandLineOptions(argc, argv, + "MLIR modular optimizer driver\n"); + + if (showDialects) { + mlir::MLIRContext context; + llvm::outs() << "Registered Dialects:\n"; + for (mlir::Dialect *dialect : context.getRegisteredDialects()) { + llvm::outs() << dialect->getNamespace() << "\n"; + } + return 0; + } + + // Set up the input file. 
+ std::string errorMessage; + auto file = mlir::openInputFile(inputFilename, &errorMessage); + if (!file) { + llvm::errs() << errorMessage << "\n"; + return 1; + } + + auto output = mlir::openOutputFile(outputFilename, &errorMessage); + if (!output) { + llvm::errs() << errorMessage << "\n"; + exit(1); + } + + if (failed(MlirOptMain(output->os(), std::move(file), passPipeline, + splitInputFile, verifyDiagnostics, verifyPasses, + allowUnregisteredDialects))) { + return 1; + } + // Keep the output file if the invocation of MlirOptMain was successful. + output->keep(); + return 0; +} diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD new file mode 100644 index 00000000000..b0f22b40f5b --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD @@ -0,0 +1,93 @@ +load("//third_party/mlir:tblgen.bzl", "gentbl") + +package( + default_visibility = ["//tensorflow/compiler/mlir/tools/kernel_gen:friends"], + licenses = ["notice"], # Apache 2.0 +) + +cc_library( + name = "tf_framework_legalize_to_llvm", + srcs = ["tf_framework_legalize_to_llvm.cc"], + hdrs = ["rewriters.h"], + deps = [ + "//tensorflow/compiler/mlir/tools/kernel_gen/ir:tf_framework_ops", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:LLVMDialect", + "@llvm-project//mlir:LLVMTransforms", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:Transforms", + ], +) + +cc_library( + name = "bufferize", + srcs = ["bufferize.cc"], + hdrs = ["rewriters.h"], + deps = [ + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:Transforms", + ], +) + +cc_library( + name = "embed_tf_framework", + srcs = ["embed_tf_framework.cc"], + hdrs = ["rewriters.h"], + deps = [ + "//tensorflow/compiler/mlir/tools/kernel_gen/ir:tf_framework_ops", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:LLVMDialect", + "@llvm-project//mlir:LLVMTransforms", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:Transforms", + ], +) + +gentbl( + name = "kernel_gen_passes_inc_gen", + tbl_outs = [("-gen-pass-decls -name KernelGen", "kernel_gen_passes.h.inc")], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "passes.td", + td_srcs = ["@llvm-project//mlir:PassBaseTdFiles"], +) + +cc_library( + name = "passes", + srcs = [ + "bufferize_pass.cc", + "embed_tf_framework_pass.cc", + "shape_to_descriptors_pass.cc", + "tf_framework_legalize_to_llvm_pass.cc", + ], + hdrs = ["passes.h"], + deps = [ + ":bufferize", + ":embed_tf_framework", + ":kernel_gen_passes_inc_gen", + ":tf_framework_legalize_to_llvm", + "//tensorflow/compiler/mlir/hlo", + "//tensorflow/compiler/mlir/hlo:hlo_legalize_to_lhlo", + "//tensorflow/compiler/mlir/hlo:lhlo", + "//tensorflow/compiler/mlir/hlo:lhlo_legalize_to_llvm", + "//tensorflow/compiler/mlir/tools/kernel_gen/ir:tf_framework_ops", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:LLVMDialect", + "@llvm-project//mlir:LLVMTransforms", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SCFDialect", + "@llvm-project//mlir:Shape", + "@llvm-project//mlir:ShapeToSCF", + "@llvm-project//mlir:ShapeToStandard", + "@llvm-project//mlir:ShapeTransforms", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:Transforms", + ], +) diff --git 
a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize.cc new file mode 100644 index 00000000000..3d5c820e6dd --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize.cc @@ -0,0 +1,110 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file implements logic for translating mixed IR to buffer form. + +#include +#include + +#include "llvm/ADT/STLExtras.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Transforms/BufferPlacement.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project + +namespace mlir { +namespace kernel_gen { +namespace transforms { + +namespace { + +class TensorFromElementsOpConverter + : public BufferAssignmentOpConversionPattern { + public: + using BufferAssignmentOpConversionPattern< + TensorFromElementsOp>::BufferAssignmentOpConversionPattern; + + LogicalResult matchAndRewrite( + TensorFromElementsOp op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const final { + Location loc = op.getLoc(); + ShapedType result_type = op.getType().cast(); + int number_of_elements = op.elements().size(); + MemRefType memref_type = + MemRefType::get({number_of_elements}, result_type.getElementType()); + Value result = rewriter.create(loc, memref_type); + for (auto operand : llvm::enumerate(operands)) { + Value index = rewriter.create(loc, operand.index()); + rewriter.create(loc, operand.value(), result, index); + } + rewriter.replaceOp(op, {result}); + return success(); + } +}; + +class TensorLoadOpConversion + : public BufferAssignmentOpConversionPattern { + public: + using BufferAssignmentOpConversionPattern< + TensorLoadOp>::BufferAssignmentOpConversionPattern; + + LogicalResult matchAndRewrite( + TensorLoadOp op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const final { + TensorLoadOpAdaptor adaptor(operands); + rewriter.replaceOp(op, {adaptor.memref()}); + return success(); + } +}; + +class ExtractElementOpConversion + : public BufferAssignmentOpConversionPattern { + public: + using BufferAssignmentOpConversionPattern< + ExtractElementOp>::BufferAssignmentOpConversionPattern; + + LogicalResult matchAndRewrite( + ExtractElementOp op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const final { + ExtractElementOpAdaptor adaptor(operands); + + if (!adaptor.aggregate().getType().isa()) { + return failure(); + } + + rewriter.replaceOpWithNewOp(op, adaptor.aggregate(), + adaptor.indices()); + return success(); + } +}; + +} // namespace + +void 
populateStandardBufferizePattern(MLIRContext *context, + BufferAssignmentPlacer *bufferAssignment, + TypeConverter *converter, + OwningRewritePatternList *patterns) { + patterns->insert(context, bufferAssignment, + converter); +} + +} // namespace transforms +} // namespace kernel_gen +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize_pass.cc new file mode 100644 index 00000000000..ef07c801bc4 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize_pass.cc @@ -0,0 +1,132 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file implements logic for translating mixed IR to buffer form. +// Currently it supports MHLO and some operations from the Standard dialect. + +#include + +#include "mlir/Dialect/SCF/SCF.h" // from @llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Transforms/BufferPlacement.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h" +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h" +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h" +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewriters.h" + +namespace mlir { +namespace kernel_gen { +namespace transforms { +namespace { + +#define GEN_PASS_CLASSES +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/kernel_gen_passes.h.inc" + +// TODO(herhut) : This could become a real pattern in bufferize pass. What we +// would need to do is insert a copy to model the semantics correctly. The same +// is true for the TensorLoad pattern that is already in there. Then buffer +// assignment free insertion and copy removal should clean this up for us. +// +// This patten erases `tensor_store(src_unranked_tensor, dst_unranked_memref)` +// op and replaces the result of the defining op produced `dst_unranked_memref` +// with the rewritten `src_unranked_tensor`. 
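In sketch form, the pattern implemented by the class below takes

    %dst = "some.op"() : () -> memref<*xf32>
    tensor_store %src, %dst : memref<*xf32>

and replaces all uses of %dst with %src, erasing both the tensor_store and its defining op (the op and types here are illustrative).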
+class UnrankedTensorStoreTestOnlyPattern + : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite( + mlir::TensorStoreOp op, ArrayRef operands, + ConversionPatternRewriter& rewriter) const final { + rewriter.replaceOp(op.memref().getDefiningOp(), op.tensor()); + rewriter.replaceOp(op, {}); + return success(); + } +}; + +struct BufferizePass : public BufferizePassBase { + public: + void runOnOperation() override { + OwningRewritePatternList patterns; + auto& context = getContext(); + ConversionTarget target(context); + target.addLegalDialect(); + target.addLegalDialect(); + target.addLegalDialect(); + target.addLegalOp(); + target.addLegalOp(); + target.addIllegalDialect(); + target.addIllegalOp(); + target.addIllegalOp(); + target.addIllegalOp(); + target.addDynamicallyLegalOp([&](TensorStoreOp op) { + return !op.tensor().getType().isa(); + }); + + BufferAssignmentTypeConverter converter; + auto typesAreLegal = [&converter](Operation* op) { + return converter.isLegal(op->getOperandTypes()) && + converter.isLegal(op->getResultTypes()); + }; + target.addDynamicallyLegalOp([&](FuncOp op) { + auto inputs = op.getType().getInputs(); + auto results = op.getType().getResults(); + return converter.isLegal(inputs) && converter.isLegal(results) && + converter.isLegal(&op.getBody()); + }); + target.addDynamicallyLegalOp(typesAreLegal); + target.addDynamicallyLegalOp(typesAreLegal); + + auto module = getOperation(); + WalkResult result = module.walk([&](FuncOp func) -> WalkResult { + BufferAssignmentPlacer bufferAssignment(func); + OwningRewritePatternList patterns; + mhlo::populateHLOToLHLOConversionPattern( + func.getContext(), &bufferAssignment, &converter, &patterns); + populateWithBufferAssignmentOpConversionPatterns< + ReturnOp, ReturnOp, lmhlo::CopyOp, + /*allowMemrefFunctionResults=*/true>(&context, &bufferAssignment, + &converter, &patterns); + populateStandardBufferizePattern(func.getContext(), &bufferAssignment, + &converter, &patterns); + patterns.insert(func.getContext()); + + return applyPartialConversion(func, target, patterns); + }); + if (result.wasInterrupted()) { + signalPassFailure(); + } + } +}; + +} // namespace + +std::unique_ptr > CreateBufferizePass() { + return std::make_unique(); +} + +} // namespace transforms +} // namespace kernel_gen +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/embed_tf_framework.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/embed_tf_framework.cc new file mode 100644 index 00000000000..aa02aefa9d2 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/embed_tf_framework.cc @@ -0,0 +1,127 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h" + +namespace mlir { +namespace kernel_gen { +namespace tf_framework { +namespace { + +// Prepends argument type list of the function with an OpKernelContextType arg. +class FuncOpConverter : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite( + FuncOp func, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + // Convert function arguments using the provided TypeConverter. + auto func_type = func.getType(); + TypeConverter::SignatureConversion conversion(func_type.getNumInputs()); + + conversion.addInputs(OpKernelContextType::get(rewriter.getContext())); + for (auto arg_type : llvm::enumerate(func_type.getInputs())) { + conversion.addInputs(arg_type.index(), arg_type.value()); + } + + TypeConverter type_converter; + if (failed(rewriter.convertRegionTypes(&func.getBody(), type_converter, + &conversion))) { + return failure(); + } + + // Update the signature of the function. + rewriter.updateRootInPlace(func, [&] { + func.setType(rewriter.getFunctionType(conversion.getConvertedTypes(), + func_type.getResults())); + }); + return success(); + } +}; + +// Converts std.alloc to tf_framework.alloc_raw using OpKernelContextType arg of +// the parent function. +class AllocOpConverter : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite( + AllocOp alloc, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + auto func = alloc.getParentOfType(); + if (func.getNumArguments() == 0) { + return failure(); + } + Value ctx = func.getArgument(0); + if (!ctx.getType().isa()) { + return failure(); + } + // Symbolic operands that bind to the symbols of the memref's layout map are + // not supported by AllocRawOp. + if (alloc.getNumSymbolicOperands() != 0) { + return failure(); + } + rewriter.replaceOpWithNewOp(alloc, alloc.getType(), ctx, + operands); + return success(); + } +}; + +// Converts std.dealloc to tf_framework.dealloc_raw using OpKernelContextType +// arg of the parent function. +class DeallocOpConverter : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite( + DeallocOp dealloc, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + FuncOp func = dealloc.getParentOfType(); + if (func.getNumArguments() == 0) { + return failure(); + } + Value ctx = func.getArgument(0); + if (!ctx.getType().isa()) { + return failure(); + } + // Operand with no layout is expected. 
+ auto operand_memref_type = dealloc.memref().getType().cast(); + if (!operand_memref_type.getAffineMaps().empty()) { + return failure(); + } + DeallocOp::Adaptor transformed(operands); + rewriter.replaceOpWithNewOp(dealloc, ctx, + transformed.memref()); + return success(); + } +}; + +} // namespace + +void PopulateEmbedTFFrameworkConversionPatterns( + MLIRContext *context, OwningRewritePatternList *patterns) { + patterns->insert( + context); +} + +} // namespace tf_framework +} // namespace kernel_gen +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/embed_tf_framework_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/embed_tf_framework_pass.cc new file mode 100644 index 00000000000..a0cfcae65d1 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/embed_tf_framework_pass.cc @@ -0,0 +1,77 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h" +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h" +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewriters.h" + +namespace mlir { +namespace kernel_gen { +namespace tf_framework { +namespace { + +#define GEN_PASS_CLASSES +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/kernel_gen_passes.h.inc" + +static constexpr StringRef kTFEntry = "tf_entry"; + +// The pass rewrites the function marked with `tf_entry` attribute. +// * adds tf_framework::OpKernelContextType argument to the function, +// * std.alloc becomes tf_framework.alloc_raw, +// * std.dealloc becomes tf_framework.dealloc_raw. +class EmbedTFFrameworkPass + : public EmbedTFFrameworkPassBase { + public: + void runOnOperation() override { + ModuleOp m = getOperation(); + + // Populate patterns. + OwningRewritePatternList patterns; + PopulateEmbedTFFrameworkConversionPatterns(m.getContext(), &patterns); + + // Set target. 
+    ConversionTarget target(getContext());
+    target.addLegalDialect<tf_framework::TFFrameworkDialect>();
+
+    target.addDynamicallyLegalOp<FuncOp>([&](FuncOp op) {
+      if (!op.getAttrOfType<UnitAttr>(kTFEntry)) {
+        return true;
+      }
+      FunctionType func_type = op.getType();
+      return func_type.getNumInputs() > 0 &&
+             func_type.getInput(0).isa<OpKernelContextType>();
+    });
+    target.addDynamicallyLegalOp<AllocOp, DeallocOp>([](Operation* op) {
+      return !op->getParentOfType<FuncOp>().getAttrOfType<UnitAttr>(kTFEntry);
+    });
+
+    if (failed(applyPartialConversion(m, target, patterns))) {
+      signalPassFailure();
+    }
+  }
+};
+
+}  // namespace
+
+std::unique_ptr<OperationPass<ModuleOp> > createEmbedTFFrameworkPass() {
+  return std::make_unique<EmbedTFFrameworkPass>();
+}
+
+}  // namespace tf_framework
+}  // namespace kernel_gen
+}  // namespace mlir
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h
new file mode 100644
index 00000000000..e65d8402fb2
--- /dev/null
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h
@@ -0,0 +1,58 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_TRANSFORMS_PASSES_H_
+#define TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_TRANSFORMS_PASSES_H_
+
+#include <memory>
+
+#include "mlir/IR/Module.h"  // from @llvm-project
+#include "mlir/Pass/Pass.h"  // from @llvm-project
+
+namespace mlir {
+namespace kernel_gen {
+namespace tf_framework {
+
+// Test pass for applying TF Framework -> LLVM patterns.
+std::unique_ptr<OperationPass<ModuleOp> >
+createTestTFFrameworkLegalizeToLLVMPass();
+
+// Pass to replace some of the Standard ops with TF Framework ops.
+// * adds tf_framework::OpKernelContextType argument to the function
+// * std.alloc becomes tf_framework.alloc_raw
+// * std.dealloc becomes tf_framework.dealloc_raw
+std::unique_ptr<OperationPass<ModuleOp> > createEmbedTFFrameworkPass();
+
+}  // namespace tf_framework
+
+namespace transforms {
+
+// Pass to transform shape computations in shape dialect to standard and scf
+// using memref descriptors.
+std::unique_ptr<OperationPass<ModuleOp> > CreateShapeToDescriptorsPass();
+
+// Pass to transform computations on values to their corresponding parts on
+// buffers.
+std::unique_ptr<OperationPass<ModuleOp> > CreateBufferizePass();
+
+}  // namespace transforms
+
+#define GEN_PASS_REGISTRATION
+#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/kernel_gen_passes.h.inc"
+
+}  // namespace kernel_gen
+}  // namespace mlir
+
+#endif  // TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_TRANSFORMS_PASSES_H_
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.td b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.td
new file mode 100644
index 00000000000..6a0e328f212
--- /dev/null
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.td
@@ -0,0 +1,42 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TF_FRAMEWORK_PASSES +#define TF_FRAMEWORK_PASSES + +include "mlir/Pass/PassBase.td" + +def TestTFFrameworkLegalizeToLLVMPass + : Pass<"test-tf-framework-legalize-to-llvm", "ModuleOp"> { + let summary = "Test pass for applying TF Framework -> LLVM patterns."; + let constructor = "tf_framework::createTestTFFrameworkLegalizeToLLVMPass()"; +} + +def EmbedTFFrameworkPass : Pass<"embed-tf-framework", "ModuleOp"> { + let summary = "Pass to embed TF Framework for allocation and error reporting"; + let constructor = "tf_framework::createEmbedTFFrameworkPass()"; +} + +def ShapeToDescriptorsPass : Pass<"test-shape-to-descriptors", "ModuleOp"> { + let summary = "Pass to transform shape computations to descriptors"; + let constructor = "transforms::CreateShapeToDescriptorsPass()"; +} + +def BufferizePass : Pass<"test-bufferize", "ModuleOp"> { + let summary = "Pass to transform operations on values to buffer based ones"; + let constructor = "transforms::CreateBufferizePass()"; +} + +#endif // TF_FRAMEWORK_PASSES diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewriters.h b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewriters.h new file mode 100644 index 00000000000..4efc1e95bc8 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewriters.h @@ -0,0 +1,54 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_TRANSFORMS_REWRITERS_H_ +#define TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_TRANSFORMS_REWRITERS_H_ + +#include "mlir/IR/MLIRContext.h" // from @llvm-project + +namespace mlir { + +class BufferAssignmentPlacer; +class LLVMTypeConverter; +class MLIRContext; +class OwningRewritePatternList; +class TypeConverter; + +namespace kernel_gen { +namespace tf_framework { + +/// Collects a set of patterns to convert from the TF Framework dialect to LLVM. +void PopulateTFFrameworkToLLVMConversionPatterns( + LLVMTypeConverter *converter, OwningRewritePatternList *patterns); + +/// Collects a set of patterns to embed TF Framework. +void PopulateEmbedTFFrameworkConversionPatterns( + MLIRContext *context, OwningRewritePatternList *patterns); + +} // namespace tf_framework + +namespace transforms { + +/// Collects a set of patterns that bufferize operations from the standard +/// dialect. 
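+/// (The supplied TypeConverter determines how tensor types are mapped to
+/// their buffer (memref) counterparts.)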
+void populateStandardBufferizePattern(MLIRContext *context, + BufferAssignmentPlacer *bufferAssignment, + TypeConverter *converter, + OwningRewritePatternList *patterns); +} // namespace transforms +} // namespace kernel_gen +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_TRANSFORMS_REWRITERS_H_ diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/shape_to_descriptors_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/shape_to_descriptors_pass.cc new file mode 100644 index 00000000000..28d3647bb63 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/shape_to_descriptors_pass.cc @@ -0,0 +1,72 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file combines patterns for lowering shape dialect to standard ops, +// structured control flow and descriptors. + +#include "mlir/Conversion/ShapeToSCF/ShapeToSCF.h" // from @llvm-project +#include "mlir/Conversion/ShapeToStandard/ShapeToStandard.h" // from @llvm-project +#include "mlir/Dialect/SCF/SCF.h" // from @llvm-project +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project +#include "mlir/Dialect/Shape/Transforms/Passes.h" // from @llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h" + +namespace mlir { +namespace kernel_gen { +namespace transforms { +namespace { + +#define GEN_PASS_CLASSES +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/kernel_gen_passes.h.inc" + +struct ShapeToDescriptorsPass + : public ShapeToDescriptorsPassBase { + public: + void runOnOperation() override { + MLIRContext &ctx = getContext(); + + // Setup target legality. + ConversionTarget target(ctx); + target.addIllegalDialect(); + target.addLegalDialect(); + target.addLegalDialect(); + + // Setup conversion patterns. + OwningRewritePatternList patterns; + populateShapeRewritePatterns(&ctx, patterns); + populateShapeToStandardConversionPatterns(patterns, &ctx); + populateShapeToSCFConversionPatterns(patterns, &ctx); + + // Apply conversion. 
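+    // Partial conversion only requires the illegal shape-dialect ops to be
+    // rewritten; all other ops in the module are left untouched.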
+ auto module = getOperation(); + if (failed(applyPartialConversion(module, target, patterns))) + signalPassFailure(); + } +}; + +} // namespace + +std::unique_ptr > CreateShapeToDescriptorsPass() { + return std::make_unique(); +} + +} // namespace transforms +} // namespace kernel_gen +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc new file mode 100644 index 00000000000..3ce111ff3ff --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc @@ -0,0 +1,201 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" // from @llvm-project +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h" +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewriters.h" + +namespace mlir { +namespace kernel_gen { +namespace tf_framework { +namespace { + +using LLVM::LLVMFuncOp; +using LLVM::LLVMType; + +static constexpr StringRef kCInterfaceAlloc = "_mlir_ciface_tf_alloc_raw"; +static constexpr StringRef kCInterfaceDealloc = "_mlir_ciface_tf_dealloc_raw"; + +/// Base class for patterns converting TF Framework ops to function calls. +template +class ConvertToLLVMCallOpPattern : public ConvertOpToLLVMPattern { + public: + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + // Attempts to find function symbol in the module, adds it if not found. 
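+  // The function (e.g. _mlir_ciface_tf_alloc_raw) is inserted as a declaration
+  // only; its definition is expected to be provided by the TF framework's C
+  // interface at runtime.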
+ FlatSymbolRefAttr getOrInsertTFFunction(PatternRewriter &rewriter, + Operation *op) const { + ModuleOp module = op->getParentOfType(); + StringRef tf_func_name = GetFuncName(); + auto tf_func = module.lookupSymbol(tf_func_name); + if (!tf_func) { + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPointToStart(module.getBody()); + auto func_type = GetFuncType(); + tf_func = rewriter.create(rewriter.getUnknownLoc(), + tf_func_name, func_type); + } + return SymbolRefAttr::get(tf_func_name, rewriter.getContext()); + } + + protected: + virtual StringRef GetFuncName() const = 0; + virtual LLVMType GetFuncType() const = 0; +}; + +class AllocRawOpConverter : public ConvertToLLVMCallOpPattern { + public: + using ConvertToLLVMCallOpPattern::ConvertToLLVMCallOpPattern; + + LogicalResult matchAndRewrite( + Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + Location loc = op->getLoc(); + AllocRawOp alloc_raw_op = cast(op); + AllocRawOp::Adaptor transformed(operands); + + MemRefType memref_type = alloc_raw_op.getType(); + + // Get memref descriptor sizes. + SmallVector sizes; + getMemRefDescriptorSizes(loc, memref_type, + llvm::to_vector<4>(transformed.dyn_sizes()), + rewriter, sizes); + // Get memory block size in bytes. + Value num_bytes = getCumulativeSizeInBytes( + loc, memref_type.getElementType(), sizes, rewriter); + + // Insert function call. + FlatSymbolRefAttr tf_func_ref = getOrInsertTFFunction(rewriter, op); + Value allocated_byte_ptr = + rewriter + .create( + loc, getVoidPtrType(), tf_func_ref, + llvm::makeArrayRef({transformed.ctx(), num_bytes})) + .getResult(0); + + MemRefDescriptor memRefDescriptor = CreateMemRefDescriptor( + loc, rewriter, memref_type, allocated_byte_ptr, sizes); + + // Return the final value of the descriptor. + rewriter.replaceOp(op, {memRefDescriptor}); + return success(); + } + + protected: + StringRef GetFuncName() const override { return kCInterfaceAlloc; } + + LLVMType GetFuncType() const override { + LLVMType llvm_void_ptr_type = getVoidPtrType(); + return LLVM::LLVMType::getFunctionTy( + llvm_void_ptr_type, + llvm::makeArrayRef({llvm_void_ptr_type, getIndexType()}), + /*isVarArg=*/false); + } + + private: + MemRefDescriptor CreateMemRefDescriptor(Location loc, + ConversionPatternRewriter &rewriter, + MemRefType memref_type, + Value allocated_byte_ptr, + ArrayRef sizes) const { + auto memref_desc = MemRefDescriptor::undef( + rewriter, loc, typeConverter.convertType(memref_type)); + + // TF AllocateRaw returns aligned pointer => AllocatedPtr == AlignedPtr. + Value allocated_type_ptr = rewriter.create( + loc, getElementPtrType(memref_type), allocated_byte_ptr); + memref_desc.setAllocatedPtr(rewriter, loc, allocated_type_ptr); + memref_desc.setAlignedPtr(rewriter, loc, allocated_type_ptr); + memref_desc.setConstantOffset(rewriter, loc, 0); + + if (memref_type.getRank() == 0) { + return memref_desc; + } + + // Compute strides and populate descriptor `size` and `stride` fields. 
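+    // Strides follow the default row-major layout: the innermost dimension
+    // has stride 1 and each outer stride is the product of the inner sizes.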
+ Value stride_carried = createIndexConstant(rewriter, loc, 1); + for (int pos = sizes.size() - 1; pos >= 0; --pos) { + Value size = sizes[pos]; + memref_desc.setSize(rewriter, loc, pos, size); + memref_desc.setStride(rewriter, loc, pos, stride_carried); + // Update stride + if (pos > 0) { + stride_carried = + rewriter.create(loc, stride_carried, size); + } + } + return memref_desc; + } +}; + +class DeallocRawOpConverter : public ConvertToLLVMCallOpPattern { + public: + using ConvertToLLVMCallOpPattern::ConvertToLLVMCallOpPattern; + + LogicalResult matchAndRewrite( + Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + DeallocRawOp::Adaptor transformed(operands); + MemRefDescriptor memref(transformed.memref()); + + Value allocated_bytes_ptr = rewriter.create( + op->getLoc(), getVoidPtrType(), + memref.allocatedPtr(rewriter, op->getLoc())); + + // Insert function call. + FlatSymbolRefAttr tf_func_ref = getOrInsertTFFunction(rewriter, op); + rewriter.replaceOpWithNewOp( + op, llvm::None, tf_func_ref, + llvm::makeArrayRef({transformed.ctx(), allocated_bytes_ptr})); + return success(); + } + + protected: + StringRef GetFuncName() const override { return kCInterfaceDealloc; } + LLVMType GetFuncType() const override { + return LLVM::LLVMType::getFunctionTy(getVoidType(), getVoidPtrType(), + /*isVarArg=*/false); + } +}; + +class NullContextOpConverter : public ConvertOpToLLVMPattern { + public: + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + LogicalResult matchAndRewrite( + Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + rewriter.replaceOpWithNewOp(op, getVoidPtrType()); + return success(); + } +}; + +} // namespace + +void PopulateTFFrameworkToLLVMConversionPatterns( + LLVMTypeConverter *converter, OwningRewritePatternList *patterns) { + patterns->insert(*converter); + patterns->insert(*converter); +} + +} // namespace tf_framework +} // namespace kernel_gen +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm_pass.cc new file mode 100644 index 00000000000..42e89433dff --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm_pass.cc @@ -0,0 +1,73 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" // from @llvm-project +#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" // from @llvm-project +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h" +#include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h" +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h" +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewriters.h" + +namespace mlir { +namespace kernel_gen { +namespace tf_framework { +namespace { + +#define GEN_PASS_CLASSES +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/kernel_gen_passes.h.inc" + +class TestTFFrameworkToLLVMPass + : public TestTFFrameworkLegalizeToLLVMPassBase { + public: + void runOnOperation() override { + ModuleOp m = getOperation(); + + // Populate type conversions. + LLVMTypeConverter type_converter(m.getContext()); + type_converter.addConversion([&](tf_framework::OpKernelContextType type) { + return LLVM::LLVMType::getInt8PtrTy(m.getContext()); + }); + + // Populate patterns. + OwningRewritePatternList patterns; + populateStdToLLVMConversionPatterns(type_converter, patterns); + PopulateTFFrameworkToLLVMConversionPatterns(&type_converter, &patterns); + lmhlo::PopulateLhloToLLVMConversionPatterns(&type_converter, &patterns); + + // Set target. + ConversionTarget target(getContext()); + target.addLegalDialect(); + target.addIllegalDialect(); + target.addLegalOp(); + + if (failed(applyFullConversion(m, target, patterns))) { + signalPassFailure(); + } + } +}; + +} // namespace + +std::unique_ptr > +createTestTFFrameworkLegalizeToLLVMPass() { + return std::make_unique(); +} + +} // namespace tf_framework +} // namespace kernel_gen +} // namespace mlir diff --git a/tensorflow/compiler/mlir/xla/BUILD b/tensorflow/compiler/mlir/xla/BUILD index 838b060079c..71e18af498b 100644 --- a/tensorflow/compiler/mlir/xla/BUILD +++ b/tensorflow/compiler/mlir/xla/BUILD @@ -48,17 +48,21 @@ cc_library( srcs = [ "transforms/generated_legalize_tf.inc", "transforms/legalize_tf.cc", + "transforms/legalize_tf_communication.cc", "transforms/legalize_tf_control_flow.cc", ], hdrs = [ "transforms/passes.h", ], deps = [ + ":type_to_shape", + ":xla_legalize_tf_with_tf2xla", "//tensorflow/compiler/mlir/hlo", "//tensorflow/compiler/mlir/hlo:chlo_legalize_to_hlo", "//tensorflow/compiler/mlir/hlo:convert_op_folder", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:lower_tf_lib", + "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/compiler/xla/client:padding", "//tensorflow/compiler/xla/client:sharding_builder", @@ -92,7 +96,11 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:export_tf_dialect_op", "//tensorflow/compiler/mlir/tensorflow:lower_tf_lib", "//tensorflow/compiler/mlir/tensorflow:translate_utils", - "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/tf2xla:xla_compilation_device", + "//tensorflow/compiler/tf2xla:xla_context", + "//tensorflow/compiler/tf2xla:xla_expression", + "//tensorflow/compiler/tf2xla:xla_helpers", + "//tensorflow/compiler/tf2xla:xla_op_registry", "//tensorflow/compiler/xla/client:xla_builder", 
"//tensorflow/core:core_cpu_lib", "//tensorflow/core:framework", @@ -123,17 +131,21 @@ cc_library( ":hlo_utils", ":mlir_hlo_to_hlo", "//tensorflow/compiler/mlir/hlo", - "//tensorflow/compiler/mlir/hlo:hlo_dialect_registration", + "//tensorflow/compiler/mlir/hlo:hlo_dialect_force_registration", "//tensorflow/compiler/mlir/hlo:lhlo", + "//tensorflow/compiler/xla:debug_options_flags", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla/service:backend", "//tensorflow/compiler/xla/service:buffer_assignment", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_casting_utils", + "//tensorflow/compiler/xla/service:hlo_parser", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Translation", ], alwayslink = 1, ) @@ -203,6 +215,7 @@ tf_cc_test( name = "type_to_shape_test", srcs = ["type_to_shape_test.cc"], deps = [ + ":hlo_utils", ":type_to_shape", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", @@ -224,11 +237,11 @@ cc_library( deps = [ ":type_to_shape", "//tensorflow/compiler/mlir/hlo", - "//tensorflow/compiler/mlir/hlo:hlo_dialect_registration", + "//tensorflow/compiler/mlir/hlo:hlo_dialect_force_registration", "//tensorflow/compiler/mlir/tensorflow:convert_type", "//tensorflow/compiler/mlir/tensorflow:error_util", "//tensorflow/compiler/tf2xla:common", - "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/tf2xla:xla_helpers", "//tensorflow/compiler/xla:comparison_util", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", @@ -315,7 +328,10 @@ cc_library( hdrs = ["xla_mlir_translate.h"], deps = [ ":hlo_to_mlir_hlo", + ":mhlo_to_lhlo_with_xla", ":mlir_hlo_to_hlo", + "//tensorflow/compiler/jit:xla_cpu_jit", + "//tensorflow/compiler/jit:xla_gpu_jit", "//tensorflow/compiler/xla:debug_options_flags", "//tensorflow/compiler/xla:status", "//tensorflow/compiler/xla:statusor", @@ -370,7 +386,7 @@ cc_library( ":xla_legalize_tf_with_tf2xla", "//tensorflow/compiler/mlir/hlo", "//tensorflow/compiler/mlir/hlo:chlo_legalize_to_hlo", - "//tensorflow/compiler/mlir/hlo:hlo_dialect_registration", + "//tensorflow/compiler/mlir/hlo:hlo_dialect_force_registration", "//tensorflow/compiler/mlir/hlo:hlo_legalize_to_lhlo", "//tensorflow/compiler/mlir/hlo:legalize_control_flow", "//tensorflow/compiler/mlir/hlo:legalize_tanh_to_approximation", diff --git a/tensorflow/compiler/mlir/xla/hlo_function_importer.cc b/tensorflow/compiler/mlir/xla/hlo_function_importer.cc index ad177ce1dc5..a63fc12c285 100644 --- a/tensorflow/compiler/mlir/xla/hlo_function_importer.cc +++ b/tensorflow/compiler/mlir/xla/hlo_function_importer.cc @@ -521,6 +521,13 @@ StatusOr HloFunctionImporter::ImportInstruction( RandomDistributionToString(instruction->random_distribution()))); } } + case HloOpcode::kRngBitGenerator: { + auto rng_op = Cast(instruction); + auto op = func_builder->create( + loc, result_type, + func_builder->getI32IntegerAttr(rng_op->algorithm()), operands[0]); + return op.getOperation(); + } case HloOpcode::kWhile: { auto op = func_builder->create( loc, operands[0].getType(), operands[0]); @@ -708,6 +715,15 @@ StatusOr HloFunctionImporter::ImportInstruction( NoAttributeCase(kCopy, CopyOp); #undef NoAttributeCase #undef MakeAndReturn + case HloOpcode::kFusion: { + auto fusion = func_builder->create( + loc, result_type, operands, + 
builder_->getStringAttr(xla::ToString(instruction->fusion_kind()))); + TF_RETURN_IF_ERROR( + ImportAsRegion(*instruction->fused_instructions_computation(), + &fusion.fused_computation())); + return fusion.getOperation(); + } case HloOpcode::kAddDependency: // Arbitrary op code that I suspect we will not implement for quite a // while and allows testing handling of unknown ops. Selected because it diff --git a/tensorflow/compiler/mlir/xla/hlo_utils.cc b/tensorflow/compiler/mlir/xla/hlo_utils.cc index 84c574139e9..cf78c81908d 100644 --- a/tensorflow/compiler/mlir/xla/hlo_utils.cc +++ b/tensorflow/compiler/mlir/xla/hlo_utils.cc @@ -77,13 +77,14 @@ StatusOr> GetPermutationIfAvailable( return tensorflow::errors::Internal( "Permutations for dynamic shapes are not yet supported"); } - llvm::SmallVector permuted_sizes; - for (auto dim : llvm::reverse(shape.layout().minor_to_major())) { - permuted_sizes.push_back(shape.dimensions(dim)); + int64_t accumulated_stride = 1; + llvm::SmallVector strides(shape.rank(), 1); + for (int64 dim : LayoutUtil::MinorToMajor(shape)) { + strides[dim] = accumulated_stride; + accumulated_stride *= shape.dimensions(dim); } - return llvm::SmallVector{AffineMap::get( - permuted_sizes.size(), 0, - makeCanonicalStridedLayoutExpr(permuted_sizes, builder.getContext()))}; + return llvm::SmallVector{ + makeStridedLinearLayoutMap(strides, /*offset=*/0, builder.getContext())}; } } // namespace diff --git a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc index 31512c90f09..c94110d9102 100644 --- a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc +++ b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc @@ -206,6 +206,15 @@ XlaOp MlirHloBuilder::Iota(const Shape& shape, int64 iota_dimension) { }); } +StatusOr MlirHloBuilder::BitcastConvertTypeInternal(const Shape& shape, + XlaOp operand) { + TF_ASSIGN_OR_RETURN(mlir::Type ty, ConvertShapeToType( + shape, builder_)); + auto op = builder_.create(loc_, ty, + GetValue(operand)); + return MakeXlaOp(op); +} + StatusOr MlirHloBuilder::TransposeInternal( const Shape& shape, XlaOp operand, absl::Span permutation) { TF_ASSIGN_OR_RETURN(mlir::Type ty, ConvertShapeToType( @@ -224,6 +233,31 @@ StatusOr MlirHloBuilder::RevInternal( return MakeXlaOp(op); } +StatusOr MlirHloBuilder::SortInternal(const Shape& shape, + absl::Span operands, + const XlaComputation& comparator, + int64 dimension, bool is_stable) { + TF_ASSIGN_OR_RETURN(mlir::Type ty, ConvertShapeToType( + shape, builder_)); + auto op = builder_.create( + loc_, ty, GetValues(operands), builder_.getI64IntegerAttr(dimension), + builder_.getBoolAttr(is_stable)); + TF_RETURN_IF_ERROR(ImportComputation(comparator.proto(), &op.comparator())); + return MakeXlaOp(op); +} + +StatusOr MlirHloBuilder::WhileInternal(const Shape& shape, + const XlaComputation& condition, + const XlaComputation& body, + XlaOp init) { + TF_ASSIGN_OR_RETURN(mlir::Type ty, ConvertShapeToType( + shape, builder_)); + auto op = builder_.create(loc_, ty, GetValue(init)); + TF_RETURN_IF_ERROR(ImportComputation(condition.proto(), &op.cond())); + TF_RETURN_IF_ERROR(ImportComputation(body.proto(), &op.body())); + return MakeXlaOp(op); +} + StatusOr MlirHloBuilder::GatherInternal( const Shape& shape, XlaOp input, XlaOp start_indices, const GatherDimensionNumbers& dimension_numbers, diff --git a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h index ab1a0d2c9b3..a12eb723465 100644 --- 
a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h +++ b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h @@ -142,6 +142,9 @@ class MlirHloBuilder : public XlaBuilder { XlaOp Iota(const Shape& shape, int64 iota_dimension) override; + StatusOr BitcastConvertTypeInternal(const Shape& shape, + XlaOp operand) override; + StatusOr TransposeInternal( const Shape& shape, XlaOp operand, absl::Span permutation) override; @@ -149,6 +152,16 @@ class MlirHloBuilder : public XlaBuilder { StatusOr RevInternal(const Shape& shape, XlaOp operand, absl::Span dimensions) override; + StatusOr SortInternal(const Shape& shape, + absl::Span operands, + const XlaComputation& comparator, + int64 dimension, bool is_stable) override; + + StatusOr WhileInternal(const Shape& shape, + const XlaComputation& condition, + const XlaComputation& body, + XlaOp init) override; + StatusOr GatherInternal( const Shape& shape, XlaOp input, XlaOp start_indices, const GatherDimensionNumbers& dimension_numbers, diff --git a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc index a4c3c43cfbf..5398cd70777 100644 --- a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc +++ b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc @@ -43,7 +43,6 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" #include "tensorflow/compiler/mlir/xla/type_to_shape.h" #include "tensorflow/compiler/tf2xla/shape_util.h" -#include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/xla/client/lib/matrix.h" #include "tensorflow/compiler/xla/client/lib/quantize.h" #include "tensorflow/compiler/xla/client/lib/slicing.h" @@ -90,6 +89,18 @@ T* Unwrap(const std::unique_ptr& t) { return t.get(); } +static mlir::LogicalResult GetXlaOp( + mlir::Value val, const llvm::DenseMap& val_map, + xla::XlaOp* result, mlir::Operation* op) { + auto iter = val_map.find(val); + if (iter == val_map.end()) { + return op->emitOpError( + "requires all operands to be defined in the parent region for export"); + } + *result = iter->second; + return mlir::success(); +} + // Convert APInt into an int. // TODO(hpucha): This should be consolidated into a general place. static int ConvertAPInt(llvm::APInt i) { return i.getSExtValue(); } @@ -170,8 +181,8 @@ static std::vector> Convert_source_target_pairs( static std::vector Convert_replica_groups( mlir::DenseIntElementsAttr groups) { - int64_t num_groups = groups.getType().getDimSize(0); - int64_t group_size = groups.getType().getDimSize(1); + uint64_t num_groups = groups.getType().getDimSize(0); + uint64_t group_size = groups.getType().getDimSize(1); std::vector result; result.reserve(num_groups); @@ -435,14 +446,14 @@ static void ExtractShardingsFromFunction( llvm::SmallVectorImpl>* ret_shardings) { arg_shardings->resize(function.getNumArguments(), absl::optional()); - for (int i = 0; i < function.getNumArguments(); ++i) + for (int i = 0, end = function.getNumArguments(); i < end; ++i) if (auto sharding = function.getArgAttrOfType(i, kShardingAttr)) (*arg_shardings)[i] = CreateOpShardingFromStringRef(sharding.getValue()); ret_shardings->resize(function.getNumResults(), absl::optional()); - for (int i = 0; i < function.getNumResults(); ++i) + for (int i = 0, end = function.getNumResults(); i < end; ++i) if (auto sharding = function.getResultAttrOfType(i, kShardingAttr)) (*ret_shardings)[i] = CreateOpShardingFromStringRef(sharding.getValue()); @@ -463,7 +474,7 @@ class ConvertToHloModule { // single value. 
explicit ConvertToHloModule( mlir::ModuleOp module, bool use_tuple_args, bool return_tuple, - tensorflow::XlaCompiler::ShapeRepresentationFn shape_representation_fn) + tensorflow::XlaHelpers::ShapeRepresentationFn shape_representation_fn) : module_(module), module_builder_("main"), use_tuple_args_(use_tuple_args), @@ -507,7 +518,7 @@ class ConvertToHloModule { // Lower function call to HLO call instruction LogicalResult LowerFunctionCall( - mlir::CallOp* call_op, xla::XlaBuilder* builder, + mlir::CallOp call_op, xla::XlaBuilder* builder, ConvertToHloModule::ValueLoweringMap* value_lowering); private: @@ -545,7 +556,7 @@ class ConvertToHloModule { // Shape representation function to determine entry function argument and // result shapes. - tensorflow::XlaCompiler::ShapeRepresentationFn shape_representation_fn_; + tensorflow::XlaHelpers::ShapeRepresentationFn shape_representation_fn_; // Unique suffix to give to the name of the next lowered region. size_t region_id_ = 0; @@ -585,23 +596,27 @@ LogicalResult ExportXlaOp(AllReduceOp op, OpLoweringContext ctx) { return failure(); } auto replica_groups = Convert_replica_groups(op.replica_groups()); + xla::XlaOp operand; + if (failed(GetXlaOp(op.operand(), value_map, &operand, op))) return failure(); + if (!op.channel_id().hasValue()) { - value_map[op] = - xla::AllReduce(value_map[op.operand()], computation, replica_groups, - /*channel_id=*/absl::nullopt); + value_map[op] = xla::AllReduce(operand, computation, replica_groups, + /*channel_id=*/absl::nullopt); return success(); } auto channel_id = Convert_channel_handle(op.channel_id().getValue()); - value_map[op] = xla::AllReduce(value_map[op.operand()], computation, - replica_groups, channel_id); + value_map[op] = + xla::AllReduce(operand, computation, replica_groups, channel_id); return success(); } LogicalResult ExportXlaOp(BitcastConvertOp op, OpLoweringContext ctx) { auto& value_map = *ctx.values; + xla::XlaOp operand; + if (failed(GetXlaOp(op.operand(), value_map, &operand, op))) return failure(); + value_map[op] = xla::BitcastConvertType( - value_map[op.operand()], - xla::TypeToPrimitiveType(getElementTypeOrSelf(op.getType()))); + operand, xla::TypeToPrimitiveType(getElementTypeOrSelf(op.getType()))); return success(); } @@ -609,8 +624,11 @@ LogicalResult ExportXlaOp(BroadcastInDimOp op, OpLoweringContext ctx) { auto type = op.getType().dyn_cast(); if (!type) return failure(); auto& value_map = *ctx.values; + xla::XlaOp operand; + if (failed(GetXlaOp(op.operand(), value_map, &operand, op))) return failure(); + value_map[op] = - BroadcastInDim(value_map[op.operand()], Convert_ArrayRef(type.getShape()), + BroadcastInDim(operand, Convert_ArrayRef(type.getShape()), Convert_broadcast_dimensions(op.broadcast_dimensions())); return success(); } @@ -640,11 +658,15 @@ LogicalResult ExportXlaOp(IfOp op, OpLoweringContext ctx) { &false_branch))) { return failure(); } + xla::XlaOp pred, true_arg, false_arg; + if (failed(GetXlaOp(op.pred(), value_map, &pred, op))) return failure(); + if (failed(GetXlaOp(op.true_arg(), value_map, &true_arg, op))) + return failure(); + if (failed(GetXlaOp(op.false_arg(), value_map, &false_arg, op))) + return failure(); value_map[op] = - xla::Conditional(value_map[op.pred()], value_map[op.true_arg()], - true_branch, value_map[op.false_arg()], false_branch); - + xla::Conditional(pred, true_arg, true_branch, false_arg, false_branch); return success(); } @@ -657,14 +679,19 @@ LogicalResult ExportXlaOp(CaseOp op, OpLoweringContext ctx) { std::vector 
computations_p(branches.size()); for (unsigned i = 0; i < branches.size(); ++i) { - branch_operands[i] = value_map[operands[i]]; + xla::XlaOp operand; + if (failed(GetXlaOp(operands[i], value_map, &operand, op))) + return failure(); + branch_operands[i] = operand; computations_p[i] = &computations[i]; if (failed(ctx.converter->LowerRegionAsComputation(&branches[i], computations_p[i]))) return failure(); } - xla::XlaOp result = - xla::Conditional(value_map[op.index()], computations_p, branch_operands); + xla::XlaOp index; + if (failed(GetXlaOp(op.index(), value_map, &index, op))) return failure(); + + xla::XlaOp result = xla::Conditional(index, computations_p, branch_operands); if (op.getNumResults() == 1) { value_map[op.getResult(0)] = result; } else { @@ -681,9 +708,11 @@ LogicalResult ExportXlaOp(ConstOp op, OpLoweringContext ctx) { LogicalResult ExportXlaOp(ConvertOp op, OpLoweringContext ctx) { auto& value_map = *ctx.values; + xla::XlaOp operand; + if (failed(GetXlaOp(op.operand(), value_map, &operand, op))) return failure(); + value_map[op] = xla::ConvertElementType( - value_map[op.operand()], - xla::TypeToPrimitiveType(getElementTypeOrSelf(op.getType()))); + operand, xla::TypeToPrimitiveType(getElementTypeOrSelf(op.getType()))); return success(); } @@ -702,7 +731,10 @@ LogicalResult ExportXlaOp(DequantizeOp op, OpLoweringContext ctx) { xla::QuantizedRange range(ConvertAPFloat(op.min_range()), ConvertAPFloat(op.max_range())); auto& value_map = *ctx.values; - auto casted = xla::ConvertElementType(value_map[op.input()], xla::U32); + xla::XlaOp input; + if (failed(GetXlaOp(op.input(), value_map, &input, op))) return failure(); + + auto casted = xla::ConvertElementType(input, xla::U32); if (op.is_16bits()) { value_map[op] = xla::Dequantize( casted, range, ConvertStringRef(op.mode()), op.transpose_output()); @@ -715,12 +747,14 @@ LogicalResult ExportXlaOp(DequantizeOp op, OpLoweringContext ctx) { LogicalResult ExportXlaOp(InfeedOp op, OpLoweringContext ctx) { auto& value_map = *ctx.values; + xla::XlaOp token; + if (failed(GetXlaOp(op.token(), value_map, &token, op))) return failure(); + // The shape argument expected by the xla client API is the type of the first // element in the result tuple. 
auto result_type = op.getType().cast().getType(0); - value_map[op] = - xla::InfeedWithToken(value_map[op.token()], xla::TypeToShape(result_type), - std::string(op.infeed_config())); + value_map[op] = xla::InfeedWithToken(token, xla::TypeToShape(result_type), + std::string(op.infeed_config())); return success(); } @@ -745,10 +779,13 @@ LogicalResult ExportXlaOp(MapOp op, OpLoweringContext ctx) { LogicalResult ExportXlaOp(OutfeedOp op, OpLoweringContext ctx) { auto& value_map = *ctx.values; - value_map[op] = - xla::OutfeedWithToken(value_map[op.operand()], value_map[op.token()], - xla::TypeToShape(op.operand().getType()), - std::string(op.outfeed_config())); + xla::XlaOp operand, token; + if (failed(GetXlaOp(op.operand(), value_map, &operand, op))) return failure(); + if (failed(GetXlaOp(op.token(), value_map, &token, op))) return failure(); + + value_map[op] = xla::OutfeedWithToken( + operand, token, xla::TypeToShape(op.operand().getType()), + std::string(op.outfeed_config())); return success(); } @@ -758,29 +795,34 @@ LogicalResult ExportXlaOp(PadOp op, OpLoweringContext ctx) { auto edge_padding_low = ConvertDenseIntAttr(op.edge_padding_low()); auto edge_padding_high = ConvertDenseIntAttr(op.edge_padding_high()); auto interior_padding = ConvertDenseIntAttr(op.interior_padding()); - for (xla::int64 i = 0; i < edge_padding_low.size(); ++i) { + for (xla::int64 i = 0, end = edge_padding_low.size(); i < end; ++i) { auto* dims = padding_config.add_dimensions(); dims->set_edge_padding_low(edge_padding_low[i]); dims->set_edge_padding_high(edge_padding_high[i]); dims->set_interior_padding(interior_padding[i]); } - value_map[op] = xla::Pad(value_map[op.getOperand(0)], - value_map[op.getOperand(1)], padding_config); + xla::XlaOp operand, padding_value; + if (failed(GetXlaOp(op.operand(), value_map, &operand, op))) return failure(); + if (failed(GetXlaOp(op.padding_value(), value_map, &padding_value, op))) + return failure(); + + value_map[op] = xla::Pad(operand, padding_value, padding_config); return success(); } LogicalResult ExportXlaOp(RecvOp op, OpLoweringContext ctx) { auto& value_map = *ctx.values; auto result_type = op.getType().cast().getType(0); + xla::XlaOp token; + if (failed(GetXlaOp(op.token(), value_map, &token, op))) return failure(); + if (op.is_host_transfer()) { - value_map[op] = - xla::RecvFromHost(value_map[op.token()], xla::TypeToShape(result_type), - Convert_channel_handle(op.channel_id())); + value_map[op] = xla::RecvFromHost(token, xla::TypeToShape(result_type), + Convert_channel_handle(op.channel_id())); return success(); } - value_map[op] = - xla::RecvWithToken(value_map[op.token()], xla::TypeToShape(result_type), - Convert_channel_handle(op.channel_id())); + value_map[op] = xla::RecvWithToken(token, xla::TypeToShape(result_type), + Convert_channel_handle(op.channel_id())); return success(); } @@ -810,9 +852,13 @@ LogicalResult ExportXlaOp(ReduceWindowOp op, OpLoweringContext ctx) { if (failed(ctx.converter->LowerRegionAsComputation(&op.body(), &body))) { return failure(); } + xla::XlaOp operand, init_value; + if (failed(GetXlaOp(op.operand(), value_map, &operand, op))) return failure(); + if (failed(GetXlaOp(op.init_value(), value_map, &init_value, op))) + return failure(); + value_map[op] = xla::ReduceWindowWithGeneralPadding( - value_map[op.operand()], value_map[op.init_value()], body, - ConvertDenseIntAttr(op.window_dimensions()), + operand, init_value, body, ConvertDenseIntAttr(op.window_dimensions()), ConvertDenseIntAttr(op.window_strides()), 
ConvertDenseIntAttr(op.base_dilations()), ConvertDenseIntAttr(op.window_dilations()), @@ -822,9 +868,11 @@ LogicalResult ExportXlaOp(ReduceWindowOp op, OpLoweringContext ctx) { LogicalResult ExportXlaOp(ReshapeOp op, OpLoweringContext ctx) { auto& value_map = *ctx.values; - value_map[op] = xla::Reshape(value_map[op.operand()], - xla::TypeToShape(op.getType()).dimensions()); + xla::XlaOp operand; + if (failed(GetXlaOp(op.operand(), value_map, &operand, op))) return failure(); + value_map[op] = + xla::Reshape(operand, xla::TypeToShape(op.getType()).dimensions()); return success(); } @@ -834,17 +882,34 @@ LogicalResult ExportXlaOp(ReturnOp op, OpLoweringContext ctx) { return failure(); } +LogicalResult ExportXlaOp(RngBitGeneratorOp op, OpLoweringContext ctx) { + auto& value_map = *ctx.values; + auto result = op.getResult(); + auto xla_arg_1 = value_map[*op.getODSOperands(0).begin()]; + auto xla_result = xla::RngBitGenerator( + static_cast(op.rng_algorithm().getSExtValue()), + Unwrap(xla_arg_1), xla::TypeToShape(result.getType()).tuple_shapes(1)); + value_map[result] = xla_result; + return mlir::success(); +} + LogicalResult ExportXlaOp(RngNormalOp op, OpLoweringContext ctx) { auto& value_map = *ctx.values; - value_map[op] = xla::RngNormal(value_map[op.mu()], value_map[op.sigma()], - xla::TypeToShape(op.getType())); + xla::XlaOp mu, sigma; + if (failed(GetXlaOp(op.mu(), value_map, &mu, op))) return failure(); + if (failed(GetXlaOp(op.sigma(), value_map, &sigma, op))) return failure(); + + value_map[op] = xla::RngNormal(mu, sigma, xla::TypeToShape(op.getType())); return success(); } LogicalResult ExportXlaOp(RngUniformOp op, OpLoweringContext ctx) { auto& value_map = *ctx.values; - value_map[op] = xla::RngUniform(value_map[op.a()], value_map[op.b()], - xla::TypeToShape(op.getType())); + xla::XlaOp a, b; + if (failed(GetXlaOp(op.a(), value_map, &a, op))) return failure(); + if (failed(GetXlaOp(op.b(), value_map, &b, op))) return failure(); + + value_map[op] = xla::RngUniform(a, b, xla::TypeToShape(op.getType())); return success(); } @@ -857,10 +922,15 @@ LogicalResult ExportXlaOp(ScatterOp op, OpLoweringContext ctx) { } xla::ScatterDimensionNumbers dimension_numbers = Convert_scatter_dimension_numbers(op.scatter_dimension_numbers()); - value_map[op] = xla::Scatter( - value_map[op.operand()], value_map[op.scatter_indices()], - value_map[op.updates()], update_computation, dimension_numbers, - op.indices_are_sorted(), op.unique_indices()); + xla::XlaOp operand, scatter_indices, updates; + if (failed(GetXlaOp(op.operand(), value_map, &operand, op))) return failure(); + if (failed(GetXlaOp(op.scatter_indices(), value_map, &scatter_indices, op))) + return failure(); + if (failed(GetXlaOp(op.updates(), value_map, &updates, op))) return failure(); + + value_map[op] = xla::Scatter(operand, scatter_indices, updates, + update_computation, dimension_numbers, + op.indices_are_sorted(), op.unique_indices()); return success(); } @@ -873,26 +943,33 @@ LogicalResult ExportXlaOp(SelectAndScatterOp op, OpLoweringContext ctx) { ctx.converter->LowerRegionAsComputation(&op.scatter(), &scatter))) { return failure(); } + xla::XlaOp operand, source, init_value; + if (failed(GetXlaOp(op.operand(), value_map, &operand, op))) return failure(); + if (failed(GetXlaOp(op.source(), value_map, &source, op))) return failure(); + if (failed(GetXlaOp(op.init_value(), value_map, &init_value, op))) + return failure(); + value_map[op] = xla::SelectAndScatterWithGeneralPadding( - value_map[op.operand()], select, - 
ConvertDenseIntAttr(op.window_dimensions()), + operand, select, ConvertDenseIntAttr(op.window_dimensions()), ConvertDenseIntAttr(op.window_strides()), Convert_padding(op.padding()), - value_map[op.source()], value_map[op.init_value()], scatter); + source, init_value, scatter); return success(); } LogicalResult ExportXlaOp(SendOp op, OpLoweringContext ctx) { auto& value_map = *ctx.values; + xla::XlaOp operand, token; + if (failed(GetXlaOp(op.operand(), value_map, &operand, op))) return failure(); + if (failed(GetXlaOp(op.token(), value_map, &token, op))) return failure(); + if (op.is_host_transfer()) { - value_map[op] = - xla::SendToHost(value_map[op.operand()], value_map[op.token()], - xla::TypeToShape(op.operand().getType()), - Convert_channel_handle(op.channel_id())); + value_map[op] = xla::SendToHost(operand, token, + xla::TypeToShape(op.operand().getType()), + Convert_channel_handle(op.channel_id())); return success(); } - value_map[op] = - xla::SendWithToken(value_map[op.operand()], value_map[op.token()], - Convert_channel_handle(op.channel_id())); + value_map[op] = xla::SendWithToken(operand, token, + Convert_channel_handle(op.channel_id())); return success(); } @@ -914,7 +991,9 @@ LogicalResult ExportXlaOp(SortOp op, OpLoweringContext ctx) { LogicalResult ExportXlaOp(TraceOp op, OpLoweringContext ctx) { auto& value_map = *ctx.values; - xla::Trace(std::string(op.tag()), value_map[op.operand()]); + xla::XlaOp operand; + if (failed(GetXlaOp(op.operand(), value_map, &operand, op))) return failure(); + xla::Trace(std::string(op.tag()), operand); return success(); } @@ -933,13 +1012,40 @@ LogicalResult ExportXlaOp(WhileOp op, OpLoweringContext ctx) { return failure(); } - value_map[op] = xla::While(condition, body, value_map[op.getOperand()]); + xla::XlaOp operand; + if (failed(GetXlaOp(op.getOperand(), value_map, &operand, op))) + return failure(); + value_map[op] = xla::While(condition, body, operand); return success(); } LogicalResult ExportXlaOp(FusionOp op, OpLoweringContext ctx) { - // TODO(whoever): currently not supported. 
- return failure(); + if (!op.fusion_kind()) { + op.emitOpError() << "requires fusion kind for HLO translation"; + return failure(); + } + + xla::XlaComputation fused_computation; + if (failed(ctx.converter->LowerRegionAsComputation(&op.fused_computation(), + &fused_computation))) + return failure(); + + auto& values = *ctx.values; + llvm::SmallVector operands; + for (auto operand : op.operands()) operands.push_back(values[operand]); + + xla::XlaOp fusion = xla::internal::XlaBuilderBuildFusion( + ctx.builder, operands, + absl::string_view(op.fusion_kind()->data(), op.fusion_kind()->size()), + fused_computation); + if (op.getNumResults() == 1) { + values[op.getResult(0)] = fusion; + } else { + for (auto item : llvm::enumerate(op.getResults())) { + values[item.value()] = xla::GetTupleElement(fusion, item.index()); + } + } + return success(); } } // namespace @@ -1032,7 +1138,7 @@ LogicalResult ConvertToHloModule::Lower( ElementsAttr const_attr; if (auto call_op = dyn_cast(inst)) { - return LowerFunctionCall(&call_op, builder, &value_map); + return LowerFunctionCall(call_op, builder, &value_map); } if (auto op = dyn_cast(inst)) { @@ -1046,7 +1152,10 @@ LogicalResult ConvertToHloModule::Lower( return failure(); } - value_map[op.getResult()] = value_map[operand]; + xla::XlaOp xla_operand; + if (failed(GetXlaOp(operand, value_map, &xla_operand, op))) + return failure(); + value_map[op.getResult()] = xla_operand; return success(); } @@ -1072,7 +1181,11 @@ LogicalResult ConvertToHloModule::Lower( std::vector returns(num_return_values); for (OpOperand& ret : inst->getOpOperands()) { unsigned index = ret.getOperandNumber(); - returns[index] = value_map[ret.get()]; + xla::XlaOp operand; + if (failed(GetXlaOp(ret.get(), value_map, &operand, inst))) + return failure(); + + returns[index] = operand; if (!is_entry_function || !has_ret_shardings) continue; xla::Shape return_shape = xla::TypeToShape(ret.get().getType()); @@ -1098,7 +1211,11 @@ LogicalResult ConvertToHloModule::Lower( return_value = xla::Tuple(builder, returns); builder->ClearSharding(); } else if (num_return_values == 1) { - return_value = value_map[inst->getOperand(0)]; + xla::XlaOp operand; + if (failed(GetXlaOp(inst->getOperand(0), value_map, &operand, inst))) + return failure(); + + return_value = operand; } // Build the XlaComputation and check for failures. @@ -1117,14 +1234,17 @@ LogicalResult ConvertToHloModule::Lower( } LogicalResult ConvertToHloModule::LowerFunctionCall( - mlir::CallOp* call_op, xla::XlaBuilder* builder, + mlir::CallOp call_op, xla::XlaBuilder* builder, ConvertToHloModule::ValueLoweringMap* value_lowering) { auto& value_map = *value_lowering; - mlir::FuncOp callee = module_.lookupSymbol(call_op->callee()); + mlir::FuncOp callee = module_.lookupSymbol(call_op.callee()); if (failed(RunOnFunction(callee))) return failure(); std::vector operands; - for (auto operand : call_op->getOperands()) { - operands.push_back(value_map[operand]); + for (auto operand : call_op.getOperands()) { + xla::XlaOp xla_operand; + if (failed(GetXlaOp(operand, value_map, &xla_operand, call_op))) + return failure(); + operands.push_back(xla_operand); } // Each call to xla::Call would insert a copy of the computation to // the HLO. 
Thus each callsite would have a unique callee in the @@ -1135,13 +1255,13 @@ LogicalResult ConvertToHloModule::LowerFunctionCall( xla::XlaOp call_result = xla::Call(builder, lowered_computation_[callee], operands); // Use GetTupleElement for multiple outputs - unsigned num_results = call_op->getNumResults(); + unsigned num_results = call_op.getNumResults(); if (num_results > 1) { for (unsigned i = 0; i != num_results; ++i) { - value_map[call_op->getResult(i)] = xla::GetTupleElement(call_result, i); + value_map[call_op.getResult(i)] = xla::GetTupleElement(call_result, i); } } else if (num_results == 1) { - value_map[call_op->getResult(0)] = call_result; + value_map[call_op.getResult(0)] = call_result; } return success(); } @@ -1271,8 +1391,7 @@ LogicalResult ConvertToHloModule::LowerBasicBlockAsFunction( llvm::ArrayRef> arg_shardings, llvm::ArrayRef> ret_shardings, xla::XlaComputation* result) { - // Mapping from the Value to lowered XlaOp. The code below lowers in - // program order and will fail if an operand is unseen. This can be improved. + // Mapping from the Value to lowered XlaOp. ValueLoweringMap lowering; // If using tuples as input, then there is only one input parameter that is a @@ -1498,9 +1617,19 @@ LogicalResult AddDynamicParameterBindings(mlir::ModuleOp module, } // namespace +Status ConvertRegionToComputation(mlir::Region* region, + xla::XlaComputation* func) { + mlir::ModuleOp module; + ConvertToHloModule converter(module, true, true, {}); + if (failed(converter.LowerRegionAsComputation(region, func))) + return tensorflow::errors::Internal( + "failed to convert region to computation"); + return Status::OK(); +} + Status ConvertMlirHloToHlo(mlir::ModuleOp module, xla::HloProto* hlo_proto, bool use_tuple_args, bool return_tuple, - const tensorflow::XlaCompiler::ShapeRepresentationFn + const tensorflow::XlaHelpers::ShapeRepresentationFn shape_representation_fn) { mlir::StatusScopedDiagnosticHandler diag_handler(module.getContext()); ConvertToHloModule converter(module, use_tuple_args, return_tuple, diff --git a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.h b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.h index 8bfe4c76b04..6f2b5a6db95 100644 --- a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.h +++ b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.h @@ -18,9 +18,10 @@ limitations under the License. #include "mlir/IR/Module.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" -#include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/core/framework/tensor_shape.h" namespace mlir { @@ -33,9 +34,14 @@ namespace mlir { // single value. Status ConvertMlirHloToHlo(mlir::ModuleOp module, ::xla::HloProto* hlo_proto, bool use_tuple_args, bool return_tuple, - const tensorflow::XlaCompiler::ShapeRepresentationFn + const tensorflow::XlaHelpers::ShapeRepresentationFn shape_representation_fn = nullptr); +// Converts a region to a computation. It returns a standalone module that +// contains the converted region as the entry computation. +Status ConvertRegionToComputation(mlir::Region* region, + ::xla::XlaComputation* func); + // Creates XlaOp equivalent of a given MLIR operation using the operand info // from `value_lowering` map. 
llvm::Optional<::xla::XlaOp> CreateXlaOperator( diff --git a/tensorflow/compiler/mlir/xla/operator_writer_gen.cc b/tensorflow/compiler/mlir/xla/operator_writer_gen.cc index 108544d96ff..407a7d3da38 100644 --- a/tensorflow/compiler/mlir/xla/operator_writer_gen.cc +++ b/tensorflow/compiler/mlir/xla/operator_writer_gen.cc @@ -85,18 +85,25 @@ static void BuildOperator(const Operator& op, raw_ostream& os) { // Emit an argument for an operand. if (auto* operand_cst = arg.dyn_cast()) { + std::string xla_arg = "xla_arg_" + std::to_string(index); // Handle a non-variadic operand. if (!operand_cst->isVariableLength()) { - os << " auto xla_arg_" << index << " = value_map[*op.getODSOperands(" - << operand_number++ << ").begin()];\n"; + os << " xla::XlaOp " << xla_arg << ";\n"; + os << " if (failed(GetXlaOp(*op.getODSOperands(" << operand_number++ + << ").begin(), value_map, &" << xla_arg << ", op)))\n"; + os << " return mlir::failure();\n"; continue; } // Otherwise, this is a varidiac operand list. - os << " std::vector xla_arg_" << index << ";\n" + os << " std::vector " << xla_arg << ";\n" << " for (auto operand : op.getODSOperands(" << operand_number++ - << "))\n xla_arg_" << index - << ".push_back(value_map[operand]);\n"; + << ")) {\n"; + os << " xla::XlaOp result;\n"; + os << " if (failed(GetXlaOp(operand, value_map, &result, op)))\n"; + os << " return mlir::failure();\n"; + os << " " << xla_arg << ".push_back(result);\n"; + os << " }\n"; continue; } diff --git a/tensorflow/compiler/mlir/xla/tests/BUILD b/tensorflow/compiler/mlir/xla/tests/BUILD index de2f6669339..2631e2b6757 100644 --- a/tensorflow/compiler/mlir/xla/tests/BUILD +++ b/tensorflow/compiler/mlir/xla/tests/BUILD @@ -6,7 +6,10 @@ package(licenses = ["notice"]) glob_lit_tests( data = [":test_utilities"], driver = "@llvm-project//mlir:run_lit.sh", - test_file_exts = ["mlir"], + test_file_exts = [ + "mlir", + "hlotxt", + ], ) # Bundle together all of the test utilities that are used by tests. 
@@ -14,6 +17,7 @@ filegroup( name = "test_utilities", testonly = True, data = [ + "//tensorflow/compiler/mlir:tf-mlir-translate", "//tensorflow/compiler/mlir:tf-opt", "//tensorflow/compiler/mlir/xla:xla-opt", "@llvm-project//llvm:FileCheck", diff --git a/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/non_identity_layouts.hlotxt b/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/non_identity_layouts.hlotxt new file mode 100644 index 00000000000..3630d2d45e4 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/non_identity_layouts.hlotxt @@ -0,0 +1,13 @@ +// RUN: tf-mlir-translate -hlo-text-to-lhlo %s | FileCheck %s + +HloModule TestModule + +// CHECK: #[[MAP:.*]] = affine_map<(d0, d1) -> (d0 + d1 * 3)> + +// CHECK: func @TestComputation +ENTRY TestComputation { + x = f32[3, 2]{1,0} parameter(0) + + // CHECK: "lmhlo.copy"(%{{.*}}, %{{.*}}) : (memref<3x2xf32>, memref<3x2xf32, #[[MAP]]>) -> () + ROOT x.copy = f32[3, 2]{0,1} copy(x) +} diff --git a/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/ops.mlir b/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/ops.mlir index 09a85177fae..2e1b63b0db7 100644 --- a/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/ops.mlir +++ b/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/ops.mlir @@ -1,7 +1,7 @@ -// RUN: xla-opt -split-input-file -xla-hlo-to-lhlo-with-xla %s | FileCheck --enable-var-scope %s +// RUN: xla-opt -split-input-file -xla-hlo-to-lhlo-with-xla %s | FILECHECK_OPTS="" FileCheck --enable-var-scope %s // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {lmhlo.params = 0 +// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {lmhlo.alloc = {{[0-9]+}} : index, lmhlo.params = 0 // CHECK-SAME: %[[ARG1:.*]]: memref<16xi8> func @main(%value: tensor<2x2xf32>) -> tensor<2x2xf32> { // CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<2x2xf32> @@ -14,8 +14,8 @@ func @main(%value: tensor<2x2xf32>) -> tensor<2x2xf32> { // ----- // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {lmhlo.params = 0 -// CHECK-SAME: %[[ARG1:.*]]: memref<2x2xf32> {lmhlo.params = 1 +// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {lmhlo.alloc = {{[0-9]+}} : index, lmhlo.params = 0 +// CHECK-SAME: %[[ARG1:.*]]: memref<2x2xf32> {lmhlo.alloc = {{[0-9]+}} : index, lmhlo.params = 1 // CHECK-SAME: %[[ARG2:.*]]: memref<16xi8> func @main(%value0: tensor<2x2xf32>, %value1: tensor<2x2xf32>) -> tensor<2x2xf32> { // CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<2x2xf32> @@ -29,8 +29,8 @@ func @main(%value0: tensor<2x2xf32>, %value1: tensor<2x2xf32>) -> tensor<2x2xf32 // ----- // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xi32> {lmhlo.params = 0 -// CHECK-SAME: %[[ARG1:.*]]: memref<2x2xi32> {lmhlo.params = 1 +// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xi32> {lmhlo.alloc = {{[0-9]+}} : index, lmhlo.params = 0 +// CHECK-SAME: %[[ARG1:.*]]: memref<2x2xi32> {lmhlo.alloc = {{[0-9]+}} : index, lmhlo.params = 1 // CHECK-SAME: %[[ARG2:.*]]: memref<16xi8> func @main(%value0: tensor<2x2xi32>, %value1: tensor<2x2xi32>) -> tensor<2x2xi32> { // CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<2x2xi32> @@ -44,7 +44,7 @@ func @main(%value0: tensor<2x2xi32>, %value1: tensor<2x2xi32>) -> tensor<2x2xi32 // ----- // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {lmhlo.params = 0 +// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {lmhlo.alloc = {{[0-9]+}} : index, lmhlo.params = 0 // CHECK-SAME: %[[ARG1:.*]]: memref<16xi8> func @main(%value0: tensor<2x2xf32>) -> 
tensor<2x2xf32> { // CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<2x2xf32> @@ -57,8 +57,8 @@ func @main(%value0: tensor<2x2xf32>) -> tensor<2x2xf32> { // ----- // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG0:.*]]: memref<1x2xf32> {lmhlo.params = 0 -// CHECK-SAME: %[[ARG1:.*]]: memref<1x2xf32> {lmhlo.params = 1 +// CHECK-SAME: %[[ARG0:.*]]: memref<1x2xf32> {lmhlo.alloc = {{[0-9]+}} : index, lmhlo.params = 0 +// CHECK-SAME: %[[ARG1:.*]]: memref<1x2xf32> {lmhlo.alloc = {{[0-9]+}} : index, lmhlo.params = 1 // CHECK-SAME: %[[ARG2:.*]]: memref<16xi8> func @main(%value0: tensor<1x2xf32>, %value1: tensor<1x2xf32>) -> tensor<1x2xcomplex> { // CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<1x2xcomplex> @@ -72,7 +72,7 @@ func @main(%value0: tensor<1x2xf32>, %value1: tensor<1x2xf32>) -> tensor<1x2xcom // ----- // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG0:.*]]: memref<1x2xcomplex> {lmhlo.params = 0 +// CHECK-SAME: %[[ARG0:.*]]: memref<1x2xcomplex> {lmhlo.alloc = {{[0-9]+}} : index, lmhlo.params = 0 // CHECK-SAME: %[[ARG1:.*]]: memref<16xi8> func @main(%value0: tensor<1x2xcomplex>) -> tensor<1x2xcomplex> { // CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<1x2xcomplex> @@ -86,8 +86,8 @@ func @main(%value0: tensor<1x2xcomplex>) -> tensor<1x2xcomplex> { // ----- // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {lmhlo.params = 0 -// CHECK-SAME: %[[ARG1:.*]]: memref<2x2xf32> {lmhlo.params = 1 +// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {lmhlo.alloc = {{[0-9]+}} : index, lmhlo.params = 0 +// CHECK-SAME: %[[ARG1:.*]]: memref<2x2xf32> {lmhlo.alloc = {{[0-9]+}} : index, lmhlo.params = 1 // CHECK-SAME: %[[ARG2:.*]]: memref<16xi8> func @main(%value0: tensor<2x2xf32>, %value1: tensor<2x2xf32>) -> tensor<2x2xf32> { // CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<2x2xf32> @@ -101,7 +101,7 @@ func @main(%value0: tensor<2x2xf32>, %value1: tensor<2x2xf32>) -> tensor<2x2xf32 // ----- // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {lmhlo.params = 0 +// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {lmhlo.alloc = {{[0-9]+}} : index, lmhlo.params = 0 // CHECK-SAME: %[[ARG1:.*]]: memref<16xi8> func @main(%value0: tensor<2x2xf32>) -> tensor<2x2xf32> { // CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<2x2xf32> @@ -114,7 +114,7 @@ func @main(%value0: tensor<2x2xf32>) -> tensor<2x2xf32> { // ----- // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {lmhlo.params = 0 +// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {lmhlo.alloc = {{[0-9]+}} : index, lmhlo.params = 0 // CHECK-SAME: %[[ARG1:.*]]: memref<16xi8> func @main(%value0: tensor<2x2xf32>) -> tensor<2x2xf32> { // CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<2x2xf32> @@ -127,8 +127,8 @@ func @main(%value0: tensor<2x2xf32>) -> tensor<2x2xf32> { // ----- // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {lmhlo.params = 0 -// CHECK-SAME: %[[ARG1:.*]]: memref<2x2xf32> {lmhlo.params = 1 +// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {lmhlo.alloc = {{[0-9]+}} : index, lmhlo.params = 0 +// CHECK-SAME: %[[ARG1:.*]]: memref<2x2xf32> {lmhlo.alloc = {{[0-9]+}} : index, lmhlo.params = 1 // CHECK-SAME: %[[ARG2:.*]]: memref<16xi8> func @main(%value0: tensor<2x2xf32>, %value1: tensor<2x2xf32>) -> tensor<2x2xf32> { // CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<2x2xf32> @@ -142,8 +142,8 @@ func @main(%value0: tensor<2x2xf32>, %value1: tensor<2x2xf32>) -> tensor<2x2xf32 // ----- // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG0:.*]]: 
memref<2x2xf32> {lmhlo.params = 0 -// CHECK-SAME: %[[ARG1:.*]]: memref<2x2xf32> {lmhlo.params = 1 +// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {lmhlo.alloc = {{[0-9]+}} : index, lmhlo.params = 0 +// CHECK-SAME: %[[ARG1:.*]]: memref<2x2xf32> {lmhlo.alloc = {{[0-9]+}} : index, lmhlo.params = 1 // CHECK-SAME: %[[ARG2:.*]]: memref<16xi8> func @main(%value0: tensor<2x2xf32>, %value1: tensor<2x2xf32>) -> tensor<2x2xf32> { // CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<2x2xf32> @@ -157,8 +157,8 @@ func @main(%value0: tensor<2x2xf32>, %value1: tensor<2x2xf32>) -> tensor<2x2xf32 // ----- // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {lmhlo.params = 0 -// CHECK-SAME: %[[ARG1:.*]]: memref<2x2xf32> {lmhlo.params = 1 +// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {lmhlo.alloc = {{[0-9]+}} : index, lmhlo.params = 0 +// CHECK-SAME: %[[ARG1:.*]]: memref<2x2xf32> {lmhlo.alloc = {{[0-9]+}} : index, lmhlo.params = 1 // CHECK-SAME: %[[ARG2:.*]]: memref<16xi8> func @main(%value0: tensor<2x2xf32>, %value1: tensor<2x2xf32>) -> tensor<2x2xf32> { // CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<2x2xf32> @@ -172,7 +172,7 @@ func @main(%value0: tensor<2x2xf32>, %value1: tensor<2x2xf32>) -> tensor<2x2xf32 // ----- // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {lmhlo.params = 0 +// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {lmhlo.alloc = {{[0-9]+}} : index, lmhlo.params = 0 // CHECK-SAME: %[[ARG1:.*]]: memref<16xi8> func @main(%value0: tensor<2x2xf32>) -> tensor<2x2xf32> { // CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<2x2xf32> @@ -185,7 +185,7 @@ func @main(%value0: tensor<2x2xf32>) -> tensor<2x2xf32> { // ----- // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG0:.*]]: memref<1x2xcomplex> {lmhlo.params = 0 +// CHECK-SAME: %[[ARG0:.*]]: memref<1x2xcomplex> {lmhlo.alloc = {{[0-9]+}} : index, lmhlo.params = 0 // CHECK-SAME: %[[ARG1:.*]]: memref<8xi8> func @main(%value0: tensor<1x2xcomplex>) -> tensor<1x2xf32> { // CHECK: %[[VIEW:.*]] = {{.*}} memref<8xi8> to memref<1x2xf32> @@ -198,7 +198,7 @@ func @main(%value0: tensor<1x2xcomplex>) -> tensor<1x2xf32> { // ----- // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG0:.*]]: memref<1x2xcomplex> {lmhlo.params = 0 +// CHECK-SAME: %[[ARG0:.*]]: memref<1x2xcomplex> {lmhlo.alloc = {{[0-9]+}} : index, lmhlo.params = 0 // CHECK-SAME: %[[ARG1:.*]]: memref<8xi8> func @main(%value0: tensor<1x2xcomplex>) -> tensor<1x2xf32> { // CHECK: %[[VIEW:.*]] = {{.*}} memref<8xi8> to memref<1x2xf32> @@ -211,8 +211,8 @@ func @main(%value0: tensor<1x2xcomplex>) -> tensor<1x2xf32> { // ----- // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xi32> {lmhlo.params = 0 -// CHECK-SAME: %[[ARG1:.*]]: memref<2x2xi32> {lmhlo.params = 1 +// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xi32> {lmhlo.alloc = {{[0-9]+}} : index, lmhlo.params = 0 +// CHECK-SAME: %[[ARG1:.*]]: memref<2x2xi32> {lmhlo.alloc = {{[0-9]+}} : index, lmhlo.params = 1 // CHECK-SAME: %[[ARG2:.*]]: memref<16xi8> func @main(%value0: tensor<2x2xi32>, %value1: tensor<2x2xi32>) -> tensor<2x2xi32> { // CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<2x2xi32> @@ -226,7 +226,7 @@ func @main(%value0: tensor<2x2xi32>, %value1: tensor<2x2xi32>) -> tensor<2x2xi32 // ----- // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {lmhlo.params = 0 +// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {lmhlo.alloc = {{[0-9]+}} : index, lmhlo.params = 0 // CHECK-SAME: %[[ARG1:.*]]: memref<16xi8> func @main(%value0: tensor<2x2xf32>) -> tensor<2x2xf32> { // 
CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<2x2xf32> @@ -239,9 +239,9 @@ func @main(%value0: tensor<2x2xf32>) -> tensor<2x2xf32> { // ----- // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xi1> {lmhlo.params = 0 -// CHECK-SAME: %[[ARG1:.*]]: memref<2x2xf32> {lmhlo.params = 1 -// CHECK-SAME: %[[ARG2:.*]]: memref<2x2xf32> {lmhlo.params = 2 +// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xi1> {lmhlo.alloc = {{[0-9]+}} : index, lmhlo.params = 0 +// CHECK-SAME: %[[ARG1:.*]]: memref<2x2xf32> {lmhlo.alloc = {{[0-9]+}} : index, lmhlo.params = 1 +// CHECK-SAME: %[[ARG2:.*]]: memref<2x2xf32> {lmhlo.alloc = {{[0-9]+}} : index, lmhlo.params = 2 // CHECK-SAME: %[[ARG3:.*]]: memref<16xi8> func @main(%pred: tensor<2x2xi1>, %lhs: tensor<2x2xf32>, %rhs: tensor<2x2xf32>) -> tensor<2x2xf32> { // CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<2x2xf32> @@ -255,7 +255,7 @@ func @main(%pred: tensor<2x2xi1>, %lhs: tensor<2x2xf32>, %rhs: tensor<2x2xf32>) // ----- // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {lmhlo.params = 0 +// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {lmhlo.alloc = {{[0-9]+}} : index, lmhlo.params = 0 // CHECK-SAME: %[[ARG1:.*]]: memref<16xi8> func @main(%value0: tensor<2x2xf32>) -> tensor<2x2xf32> { // CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<2x2xf32> @@ -268,7 +268,7 @@ func @main(%value0: tensor<2x2xf32>) -> tensor<2x2xf32> { // ----- // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {lmhlo.params = 0 +// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {lmhlo.alloc = {{[0-9]+}} : index, lmhlo.params = 0 // CHECK-SAME: %[[ARG1:.*]]: memref<16xi8> func @main(%value0: tensor<2x2xf32>) -> tensor<2x2xf32> { // CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<2x2xf32> @@ -281,8 +281,8 @@ func @main(%value0: tensor<2x2xf32>) -> tensor<2x2xf32> { // ----- // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xi32> {lmhlo.params = 0 -// CHECK-SAME: %[[ARG1:.*]]: memref<2x2xi32> {lmhlo.params = 1 +// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xi32> {lmhlo.alloc = {{[0-9]+}} : index, lmhlo.params = 0 +// CHECK-SAME: %[[ARG1:.*]]: memref<2x2xi32> {lmhlo.alloc = {{[0-9]+}} : index, lmhlo.params = 1 // CHECK-SAME: %[[ARG2:.*]]: memref<16xi8> func @main(%value0: tensor<2x2xi32>, %value1: tensor<2x2xi32>) -> tensor<2x2xi32> { // CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<2x2xi32> @@ -296,7 +296,7 @@ func @main(%value0: tensor<2x2xi32>, %value1: tensor<2x2xi32>) -> tensor<2x2xi32 // ----- // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {lmhlo.params = 0 +// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {lmhlo.alloc = {{[0-9]+}} : index, lmhlo.params = 0 // CHECK-SAME: %[[ARG1:.*]]: memref<16xi8> func @main(%value0: tensor<2x2xf32>) -> tensor<2x2xf32> { // CHECK: %[[VIEW:.*]] = {{.*}} memref<16xi8> to memref<2x2xf32> diff --git a/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/passthrough.mlir b/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/passthrough.mlir index cc07624d63d..88e5e1e0a32 100644 --- a/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/passthrough.mlir +++ b/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/passthrough.mlir @@ -1,9 +1,9 @@ -// RUN: xla-opt -xla-hlo-to-lhlo-with-xla %s | FileCheck --enable-var-scope %s +// RUN: xla-opt -xla-hlo-to-lhlo-with-xla %s | FILECHECK_OPTS="" FileCheck --enable-var-scope %s // Current allocation will lead to one buffer argument for the "value" and // another one for the output, an no returned values. 
// CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {lmhlo.params = 0 : index}, +// CHECK-SAME: %[[ARG0:.*]]: memref<2x2xf32> {lmhlo.alloc = 1 : index, lmhlo.params = 0 : index}, // CHECK-SAME: %[[ARG1:.*]]: memref<16xi8> {lmhlo.alloc = 0 : index, lmhlo.liveout = true} // CHECK-SAME: ) { func @main(%value: tensor<2x2xf32>) -> tensor<2x2xf32> { diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-BatchMatMulV2.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-BatchMatMulV2.mlir index de03921f091..69eaeeb946d 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf-BatchMatMulV2.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-BatchMatMulV2.mlir @@ -10,14 +10,14 @@ func @batchmatmulv2_basic(%arg0: tensor<1x4x2xf32>, %arg1: tensor<3x2x4xf32>) -> // CHECK: [[LHSSHAPE:%.*]] = shape.shape_of [[LHS]] : tensor<1x4x2xf32> // CHECK: [[RHSSHAPE:%.*]] = shape.shape_of [[RHS]] : tensor<3x2x4xf32> // CHECK: [[CM2:%.*]] = constant -2 : i32 -// CHECK: [[LHSHEAD:%.*]], [[LHSTAIL:%.*]] = "shape.split_at"([[LHSSHAPE]], [[CM2]]) : (!shape.shape, i32) -> (!shape.shape, !shape.shape) -// CHECK: [[RHSHEAD:%.*]], [[RHSTAIL:%.*]] = "shape.split_at"([[RHSSHAPE]], [[CM2]]) : (!shape.shape, i32) -> (!shape.shape, !shape.shape) -// CHECK: [[BCASTHEAD:%.*]] = "shape.broadcast"([[LHSHEAD]], [[RHSHEAD]]) : (!shape.shape, !shape.shape) -> !shape.shape -// CHECK: [[LHSBCASTSHAPE:%.*]] = "shape.concat"([[BCASTHEAD]], [[LHSTAIL]]) : (!shape.shape, !shape.shape) -> !shape.shape -// CHECK: [[LHSSHAPEEXTENTS:%.*]] = shape.to_extent_tensor [[LHSBCASTSHAPE]] : tensor<3xindex> +// CHECK: [[LHSHEAD:%.*]], [[LHSTAIL:%.*]] = "shape.split_at"([[LHSSHAPE]], [[CM2]]) +// CHECK: [[RHSHEAD:%.*]], [[RHSTAIL:%.*]] = "shape.split_at"([[RHSSHAPE]], [[CM2]]) +// CHECK: [[BCASTHEAD:%.*]] = shape.broadcast [[LHSHEAD]], [[RHSHEAD]] +// CHECK: [[LHSBCASTSHAPE:%.*]] = shape.concat [[BCASTHEAD]], [[LHSTAIL]] +// CHECK: [[LHSSHAPEEXTENTS:%.*]] = shape.to_extent_tensor [[LHSBCASTSHAPE]] // CHECK: [[LHSBCAST:%.*]] = "mhlo.dynamic_broadcast_in_dim"([[LHS]], [[LHSSHAPEEXTENTS]]) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x4x2xf32>, tensor<3xindex>) -> tensor<3x4x2xf32> -// CHECK: [[RHSBCASTSHAPE:%.*]] = "shape.concat"([[BCASTHEAD]], [[RHSTAIL]]) : (!shape.shape, !shape.shape) -> !shape.shape -// CHECK: [[RHSSHAPEEXTENTS:%.*]] = shape.to_extent_tensor [[RHSBCASTSHAPE]] : tensor<3xindex> +// CHECK: [[RHSBCASTSHAPE:%.*]] = shape.concat [[BCASTHEAD]], [[RHSTAIL]] +// CHECK: [[RHSSHAPEEXTENTS:%.*]] = shape.to_extent_tensor [[RHSBCASTSHAPE]] // CHECK: [[RHSBCAST:%.*]] = "mhlo.dynamic_broadcast_in_dim"([[RHS]], [[RHSSHAPEEXTENTS]]) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<3x2x4xf32>, tensor<3xindex>) -> tensor<3x2x4xf32> // CHECK: [[RESULT:%.*]] = "mhlo.dot_general"([[LHSBCAST]], [[RHSBCAST]]) {dot_dimension_numbers = {lhs_batching_dimensions = dense<0> : tensor<1xi64>, lhs_contracting_dimensions = dense<2> : tensor<1xi64>, rhs_batching_dimensions = dense<0> : tensor<1xi64>, rhs_contracting_dimensions = dense<1> : tensor<1xi64>}} : (tensor<3x4x2xf32>, tensor<3x2x4xf32>) -> tensor<3x4x4xf32> // CHECK: return [[RESULT]] : tensor<3x4x4xf32> diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-binary-elementwise.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-binary-elementwise.mlir index 45c90d26ab4..5f3e40f923f 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf-binary-elementwise.mlir +++ 
b/tensorflow/compiler/mlir/xla/tests/legalize-tf-binary-elementwise.mlir @@ -48,8 +48,8 @@ func @add_dynamic(%arg0: tensor, %arg1: tensor) -> tensor, tensor -> tensor + // CHECK-NEXT: %[[RESULT_EXTENTS:.+]] = tensor_cast %[[RESULT_SHAPE]] : tensor to tensor<2xindex> // CHECK-NEXT: %[[LHS_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} // CHECK-NEXT: %[[RHS_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} // CHECK-NEXT: %[[RESULT:.+]] = mhlo.add %[[LHS_BCAST]], %[[RHS_BCAST]] : tensor @@ -201,8 +201,8 @@ func @equal_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor // NOT-CHECK-NEXT: %[[WITNESS:.+]] = shape.cstr_broadcastable %[[LHS_SHAPE]], %[[RHS_SHAPE]] // NOT-CHECK-NEXT: shape.assuming %[[WITNESS]] -> (tensor) { // NOT-CHECK-DAG: %[[LHS_SHAPE1:.+]] = shape.shape_of %arg0 - // NOT-CHECK-NEXT: %[[RESULT_SHAPE:.+]] = "shape.broadcast"(%[[LHS_SHAPE1]], %[[RHS_SHAPE]]) - // NOT-CHECK-NEXT: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_SHAPE]] + // NOT-CHECK-NEXT: %[[RESULT_SHAPE:.+]] = shape.broadcast %[[LHS_SHAPE1]], %[[RHS_SHAPE]] : tensor, tensor -> tensor + // NOT-CHECK-NEXT: %[[RESULT_EXTENTS:.+]] = tensor_cast %[[RESULT_SHAPE]] : tensor to tensor<1xindex> // NOT-CHECK-DAG: %[[LHS_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} // NOT-CHECK-DAG: %[[RHS_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} // NOT-CHECK-NEXT: %[[RESULT:.+]] = "mhlo.compare"(%[[LHS_BCAST]], %[[RHS_BCAST]]) {comparison_direction = "EQ"} @@ -290,8 +290,8 @@ func @greater_dynamic(%arg0: tensor, %arg1: tensor) -> tensor, tensor -> tensor + // CHECK-NEXT: %[[RESULT_EXTENTS:.+]] = tensor_cast %[[RESULT_SHAPE]] : tensor to tensor<1xindex> // CHECK-DAG: %[[LHS_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} // CHECK-DAG: %[[RHS_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} // CHECK-NEXT: "mhlo.compare"(%[[LHS_BCAST]], %[[RHS_BCAST]]) {comparison_direction = "GT"} diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-communication.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-communication.mlir new file mode 100644 index 00000000000..550b2ba4da3 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-communication.mlir @@ -0,0 +1,1107 @@ +// RUN: tf-opt -split-input-file -verify-diagnostics -xla-legalize-tf-communication %s | FileCheck %s + +// Test legalization of `tf._XlaHostComputeMlir` expands into individual +// `mhlo.send` per operand and `mhlo.recv` per result. Channel Id's are uniquely +// assigned per mhlo communcation op, and frontend attributes (modified keys) +// and op shardings (based on `tpu_core`) are added. Sink tokens are created +// if there are more than one operand or more than one result. 
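For orientation, a single-operand/single-result `tf._XlaHostComputeMlir` lowers to one token-threaded send/recv pair along the following lines; this is only a sketch of the pattern the CHECK lines below verify, with illustrative i32/f32 element types and the frontend-attribute/sharding attributes left out.

  // Sketch only: element types assumed; frontend attributes and sharding omitted.
  %token  = "mhlo.create_token"() : () -> !mhlo.token
  %sent   = "mhlo.send"(%operand, %token) {channel_id = {handle = 1 : i64, type = 2 : i64}, is_host_transfer = true} : (tensor<i32>, !mhlo.token) -> !mhlo.token
  %recvd  = "mhlo.recv"(%sent) {channel_id = {handle = 2 : i64, type = 3 : i64}, is_host_transfer = true} : (!mhlo.token) -> tuple<tensor<f32>, !mhlo.token>
  %result = "mhlo.get_tuple_element"(%recvd) {index = 0 : i32} : (tuple<tensor<f32>, !mhlo.token>) -> tensor<f32>
  %token2 = "mhlo.get_tuple_element"(%recvd) {index = 1 : i32} : (tuple<tensor<f32>, !mhlo.token>) -> !mhlo.token

With more than one operand or result, the per-operand send tokens (and per-result recv tokens) are joined by `mhlo.after_all` sink tokens, as exercised by `@host_compute` below.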
+// +// The following op sharding is used: +// Proto debug string: +// type: MAXIMAL +// tile_assignment_dimensions: 1 +// tile_assignment_devices: 0 +// Serialized string: +// "\08\01\1A\01\01\22\01\00" + +// CHECK-LABEL: func @host_compute +// CHECK-SAME: ([[ARG0:%.*]]: tensor, [[ARG1:%.*]]: tensor) +func @host_compute(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) { + // CHECK: [[INIT_TOKEN:%.*]] = "mhlo.create_token" + + // CHECK: [[SEND_ARG0_TOKEN:%.*]] = "mhlo.send"([[ARG0]], [[INIT_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 1 : i64, type = 2 : i64} + // CHECK-SAME: is_host_transfer = true + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "s32", _xla_host_transfer_rendezvous = "host_compute_channel_send_dtoh_0"} + // CHECK-SAME: mhlo.sharding = "\08\01\1A\01\01\22\01\00" + // CHECK-SAME: (tensor, !mhlo.token) -> !mhlo.token + + // CHECK: [[SEND_ARG1_TOKEN:%.*]] = "mhlo.send"([[ARG1]], [[INIT_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 2 : i64, type = 2 : i64} + // CHECK-SAME: is_host_transfer = true + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "s64", _xla_host_transfer_rendezvous = "host_compute_channel_send_dtoh_1"} + // CHECK-SAME: mhlo.sharding = "\08\01\1A\01\01\22\01\00" + // CHECK-SAME: (tensor, !mhlo.token) -> !mhlo.token + + // CHECK: [[SEND_SINK_TOKEN:%.*]] = "mhlo.after_all"([[SEND_ARG0_TOKEN]], [[SEND_ARG1_TOKEN]]) + + // CHECK: [[RECV_RETVAL0_TUPLE:%.*]] = "mhlo.recv"([[SEND_SINK_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 3 : i64, type = 3 : i64} + // CHECK-SAME: is_host_transfer = true + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "f32", _xla_host_transfer_rendezvous = "host_compute_channel_recv_htod_0"} + // CHECK-SAME: mhlo.sharding = "\08\01\1A\01\01\22\01\00" + // CHECK-SAME: (!mhlo.token) -> tuple, !mhlo.token> + + // CHECK: [[RECV_RETVAL0_VAL:%.*]] = "mhlo.get_tuple_element"([[RECV_RETVAL0_TUPLE]]) + // CHECK-SAME: index = 0 + // CHECK-SAME: mhlo.sharding = "\08\01\1A\01\01\22\01\00" + // CHECK-SAME: (tuple, !mhlo.token>) -> tensor + + // CHECK: [[RECV_RETVAL0_TOKEN:%.*]] = "mhlo.get_tuple_element"([[RECV_RETVAL0_TUPLE]]) + // CHECK-SAME: index = 1 + // CHECK-SAME: mhlo.sharding = "\08\01\1A\01\01\22\01\00" + // CHECK-SAME: (tuple, !mhlo.token>) -> !mhlo.token + + // CHECK: [[RECV_RETVAL1_TUPLE:%.*]] = "mhlo.recv"([[SEND_SINK_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 4 : i64, type = 3 : i64} + // CHECK-SAME: is_host_transfer = true + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "f64", _xla_host_transfer_rendezvous = "host_compute_channel_recv_htod_1"} + // CHECK-SAME: mhlo.sharding = "\08\01\1A\01\01\22\01\00" + // CHECK-SAME: (!mhlo.token) -> tuple, !mhlo.token> + + // CHECK: [[RECV_RETVAL1_VAL:%.*]] = "mhlo.get_tuple_element"([[RECV_RETVAL1_TUPLE]]) + // CHECK-SAME: index = 0 + // CHECK-SAME: mhlo.sharding = "\08\01\1A\01\01\22\01\00" + // CHECK-SAME: (tuple, !mhlo.token>) -> tensor + + // CHECK: [[RECV_RETVAL1_TOKEN:%.*]] = "mhlo.get_tuple_element"([[RECV_RETVAL1_TUPLE]]) + // CHECK-SAME: index = 1 + // CHECK-SAME: mhlo.sharding = "\08\01\1A\01\01\22\01\00" + // CHECK-SAME: (tuple, !mhlo.token>) -> !mhlo.token + + // CHECK: [[RECV_SINK_TOKEN:%.*]] = "mhlo.after_all"([[RECV_RETVAL0_TOKEN]], [[RECV_RETVAL1_TOKEN]]) + %0:2 = "tf._XlaHostComputeMlir"(%arg0, %arg1) {recv_key = "host_compute_channel_recv", send_key = "host_compute_channel_send", tpu_core = 0 : i64} : (tensor, tensor) -> (tensor, tensor) + + // 
CHECK: return [[RECV_RETVAL0_VAL]], [[RECV_RETVAL1_VAL]] : tensor, tensor + return %0#0, %0#1 : tensor, tensor +} + +// ----- + +// Tests `tf._XlaHostComputeMlir` with `tpu_core` assigns the correct op +// sharding. +// +// The following op sharding is used: +// Proto debug string: +// type: MAXIMAL +// tile_assignment_dimensions: 1 +// tile_assignment_devices: 1 +// Serialized string: +// "\08\01\1A\01\01\22\01\01" + +// CHECK-LABEL: func @host_compute_sharding +// CHECK-SAME: ([[ARG0:%.*]]: tensor) +func @host_compute_sharding(%arg0: tensor) -> tensor { + // CHECK: "mhlo.send" + // CHECK-SAME: mhlo.sharding = "\08\01\1A\01\01\22\01\01" + // CHECK: "mhlo.recv" + // CHECK-SAME: mhlo.sharding = "\08\01\1A\01\01\22\01\01" + // CHECK: "mhlo.get_tuple_element" + // CHECK-SAME: mhlo.sharding = "\08\01\1A\01\01\22\01\01" + // CHECK: "mhlo.get_tuple_element" + // CHECK-SAME: mhlo.sharding = "\08\01\1A\01\01\22\01\01" + %0 = "tf._XlaHostComputeMlir"(%arg0) {recv_key = "host_compute_channel_recv", send_key = "host_compute_channel_send", tpu_core = 1 : i64} : (tensor) -> tensor + return %0 : tensor +} + +// ----- + +// Tests `tf._XlaHostComputeMlir` with no operands simply forwards the input +// token to its generated `mhlo.recv`. + +// CHECK-LABEL: func @host_compute_no_operands_one_result +func @host_compute_no_operands_one_result() { + // CHECK: [[INIT_TOKEN:%.*]] = "mhlo.create_token" + + // CHECK-NOT: "mhlo.send" + // CHECK-NOT: "mhlo.after_all" + // CHECK: "mhlo.recv"([[INIT_TOKEN]]) + %0 = "tf._XlaHostComputeMlir"() {recv_key = "host_compute_channel_recv", send_key = "host_compute_channel_send", tpu_core = 0 : i64} : () -> tensor + return +} + +// ----- + +// Tests `tf._XlaHostComputeMlir` with no results simply forwards its token from +// the generated `mhlo.send`. + +// CHECK-LABEL: func @host_compute_one_operand_no_results +// CHECK-SAME: ([[ARG0:%.*]]: tensor) +func @host_compute_one_operand_no_results(%arg0: tensor) { + // CHECK: [[INIT_TOKEN:%.*]] = "mhlo.create_token" + + // CHECK: [[SEND_TOKEN:%.*]] = "mhlo.send"([[ARG0]], [[INIT_TOKEN]]) + // CHECK-NOT: "mhlo.after_all" + "tf._XlaHostComputeMlir"(%arg0) {recv_key = "host_compute_channel_recv", send_key = "host_compute_channel_send", tpu_core = 0 : i64} : (tensor) -> () + + // CHECK: "mhlo.recv"([[SEND_TOKEN]]) + %0 = "tf.XlaRecvFromHost"() {key = "recv_key", shape = #tf.shape<>} : () -> tensor + return +} + +// ----- + +// Tests `tf._XlaHostComputeMlir` with one operand and one result does not +// create any `mhlo.after_all` ops. + +// CHECK-LABEL: func @host_compute_single_operand_result +// CHECK-SAME: ([[ARG0:%.*]]: tensor) +func @host_compute_single_operand_result(%arg0: tensor) { + // CHECK: [[INIT_TOKEN:%.*]] = "mhlo.create_token" + + // CHECK: [[SEND_TOKEN:%.*]] = "mhlo.send"([[ARG0]], [[INIT_TOKEN]]) + // CHECK-NOT: "mhlo.after_all" + // CHECK: "mhlo.recv"([[SEND_TOKEN]]) + // CHECK-NOT: "mhlo.after_all" + %0 = "tf._XlaHostComputeMlir"(%arg0) {recv_key = "host_compute_channel_recv", send_key = "host_compute_channel_send", tpu_core = 0 : i64} : (tensor) -> tensor + return +} + +// ----- + +// Test legalization of `tf.XlaSendToHost` expands into a `mhlo.send` op. 
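As a compact sketch of that expansion (the i32 element type is inferred from the `_xla_host_transfer_original_type = "s32"` attribute checked below):

  // Sketch only: element type assumed from the checked frontend attribute.
  %token = "mhlo.create_token"() : () -> !mhlo.token
  %done  = "mhlo.send"(%arg0, %token) {channel_id = {handle = 1 : i64, type = 2 : i64}, is_host_transfer = true, mhlo.frontend_attributes = {_xla_host_transfer_original_type = "s32", _xla_host_transfer_rendezvous = "send_key"}} : (tensor<i32>, !mhlo.token) -> !mhlo.token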
+ +// CHECK-LABEL: func @send_to_host +// CHECK-SAME: ([[ARG0:%.*]]: tensor) +func @send_to_host(%arg0: tensor) { + // CHECK: [[INIT_TOKEN:%.*]] = "mhlo.create_token" + + // CHECK: "mhlo.send"([[ARG0]], [[INIT_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 1 : i64, type = 2 : i64} + // CHECK-SAME: is_host_transfer = true + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "s32", _xla_host_transfer_rendezvous = "send_key"} + // CHECK-SAME: (tensor, !mhlo.token) -> !mhlo.token + "tf.XlaSendToHost"(%arg0) {key = "send_key"} : (tensor) -> () + return +} + +// ----- + +// Test legalization of `tf.XlaRecvFromHost` expands into a `mhlo.recv` op. + +// CHECK-LABEL: func @recv_from_host +func @recv_from_host() -> tensor { + // CHECK: [[INIT_TOKEN:%.*]] = "mhlo.create_token" + + // CHECK: [[RECV_TUPLE:%.*]] = "mhlo.recv"([[INIT_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 1 : i64, type = 3 : i64} + // CHECK-SAME: is_host_transfer = true + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "s32", _xla_host_transfer_rendezvous = "recv_key"} + // CHECK-SAME: (!mhlo.token) -> tuple, !mhlo.token> + + + // CHECK: [[RECV_VAL:%.*]] = "mhlo.get_tuple_element"([[RECV_TUPLE]]) + // CHECK-SAME: index = 0 + // CHECK-SAME: (tuple, !mhlo.token>) -> tensor + + // CHECK: [[RECV_TOKEN:%.*]] = "mhlo.get_tuple_element"([[RECV_TUPLE]]) + // CHECK-SAME: index = 1 + // CHECK-SAME: (tuple, !mhlo.token>) -> !mhlo.token + %0 = "tf.XlaRecvFromHost"() {key = "recv_key", shape = #tf.shape<>} : () -> tensor + + // CHECK: return [[RECV_VAL]] : tensor + return %0 : tensor +} + +// ----- + +// Test legalization of multiple TF/XLA communication ops are sequenced with +// their generated tokens. Channel Id's are also uniquely assigned. 
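Schematically, with two consecutive host computes the recv token of the first feeds the send of the second, and channel handles are assigned 1 through 4 in order:

  // Sketch only: element types assumed; frontend attributes omitted.
  %t0 = "mhlo.create_token"() : () -> !mhlo.token
  %t1 = "mhlo.send"(%arg0, %t0) {channel_id = {handle = 1 : i64, type = 2 : i64}, is_host_transfer = true} : (tensor<i32>, !mhlo.token) -> !mhlo.token
  %r0 = "mhlo.recv"(%t1) {channel_id = {handle = 2 : i64, type = 3 : i64}, is_host_transfer = true} : (!mhlo.token) -> tuple<tensor<i32>, !mhlo.token>
  %v0 = "mhlo.get_tuple_element"(%r0) {index = 0 : i32} : (tuple<tensor<i32>, !mhlo.token>) -> tensor<i32>
  %t2 = "mhlo.get_tuple_element"(%r0) {index = 1 : i32} : (tuple<tensor<i32>, !mhlo.token>) -> !mhlo.token
  %t3 = "mhlo.send"(%v0, %t2) {channel_id = {handle = 3 : i64, type = 2 : i64}, is_host_transfer = true} : (tensor<i32>, !mhlo.token) -> !mhlo.token
  %r1 = "mhlo.recv"(%t3) {channel_id = {handle = 4 : i64, type = 3 : i64}, is_host_transfer = true} : (!mhlo.token) -> tuple<tensor<i32>, !mhlo.token>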
+ +// CHECK-LABEL: func @multiple_consecutive_ops +// CHECK-SAME: ([[ARG0:%.*]]: tensor) +func @multiple_consecutive_ops(%arg0: tensor) -> tensor { + // CHECK: [[INIT_TOKEN:%.*]] = "mhlo.create_token" + + // CHECK: [[SEND0_ARG0_TOKEN:%.*]] = "mhlo.send"([[ARG0]], [[INIT_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 1 : i64, type = 2 : i64} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "s32", _xla_host_transfer_rendezvous = "send0_dtoh_0"} + + // CHECK: [[RECV0_RETVAL0_TUPLE:%.*]] = "mhlo.recv"([[SEND0_ARG0_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 2 : i64, type = 3 : i64} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "s32", _xla_host_transfer_rendezvous = "recv0_htod_0"} + + // CHECK: [[RECV0_RETVAL0_VAL:%.*]] = "mhlo.get_tuple_element"([[RECV0_RETVAL0_TUPLE]]) + // CHECK-SAME: index = 0 + + // CHECK: [[RECV0_RETVAL0_TOKEN:%.*]] = "mhlo.get_tuple_element"([[RECV0_RETVAL0_TUPLE]]) + // CHECK-SAME: index = 1 + %0 = "tf._XlaHostComputeMlir"(%arg0) {recv_key = "recv0", send_key = "send0", tpu_core = 0 : i64} : (tensor) -> tensor + + // CHECK: [[SEND1_ARG0_TOKEN:%.*]] = "mhlo.send"([[RECV0_RETVAL0_VAL]], [[RECV0_RETVAL0_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 3 : i64, type = 2 : i64} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "s32", _xla_host_transfer_rendezvous = "send1_dtoh_0"} + + // CHECK: [[RECV1_RETVAL0_TUPLE:%.*]] = "mhlo.recv"([[SEND1_ARG0_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 4 : i64, type = 3 : i64} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "s32", _xla_host_transfer_rendezvous = "recv1_htod_0"} + + // CHECK: [[RECV1_RETVAL0_VAL:%.*]] = "mhlo.get_tuple_element"([[RECV1_RETVAL0_TUPLE]]) + // CHECK-SAME: index = 0 + + // CHECK: [[RECV1_RETVAL0_TOKEN:%.*]] = "mhlo.get_tuple_element"([[RECV1_RETVAL0_TUPLE]]) + // CHECK-SAME: index = 1 + %1 = "tf._XlaHostComputeMlir"(%0) {recv_key = "recv1", send_key = "send1", tpu_core = 0 : i64} : (tensor) -> tensor + + // CHECK: return [[RECV1_RETVAL0_VAL]] : tensor + return %1 : tensor +} + +// ----- + +// Test private function with TF/XLA communication op being called by another +// function gets rewritten with an extra token argument and an extra token +// result, and the caller passes in a token. The top level function not called +// (or public) will be updated to create a token. 
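Concretely, the signature rewrite described above looks roughly like this (i32 element type assumed):

  // Sketch only: a private callee containing a communication op, before the rewrite:
  func @callee(%arg0: tensor<i32>) -> tensor<i32> attributes {sym_visibility = "private"}
  // After the rewrite: an extra !mhlo.token argument and result are threaded
  // through, and callers pass in (or create) a token.
  func @callee(%arg0: tensor<i32>, %token: !mhlo.token) -> (tensor<i32>, !mhlo.token) attributes {sym_visibility = "private"}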
+ +// CHECK: func @main([[MAIN_ARG0:%.*]]: tensor) -> tensor +func @main(%arg0: tensor) -> tensor { + // CHECK: [[MAIN_TOKEN:%.*]] = "mhlo.create_token" + + // CHECK: [[CALL:%.*]]:2 = call @callee([[MAIN_ARG0]], [[MAIN_TOKEN]]) + // CHECK-SAME: (tensor, !mhlo.token) -> (tensor, !mhlo.token) + %0 = call @callee(%arg0) : (tensor) -> tensor + + // CHECK: return [[CALL]]#0 + return %0 : tensor +} + +// CHECK: func @callee([[CALLEE_ARG0:%.*]]: tensor, [[CALLEE_ARG1:%.*]]: !mhlo.token) -> (tensor, !mhlo.token) +func @callee(%arg0: tensor) -> tensor attributes {sym_visibility = "private"} { + // CHECK-NOT: "mhlo.create_token" + + // CHECK: [[SEND_ARG0_TOKEN:%.*]] = "mhlo.send"([[CALLEE_ARG0]], [[CALLEE_ARG1]]) + // CHECK: [[RECV_RETVAL0_TUPLE:%.*]] = "mhlo.recv"([[SEND_ARG0_TOKEN]]) + // CHECK: [[RECV_RETVAL0_VAL:%.*]] = "mhlo.get_tuple_element"([[RECV_RETVAL0_TUPLE]]) + // CHECK-SAME: index = 0 + // CHECK: [[RECV_RETVAL0_TOKEN:%.*]] = "mhlo.get_tuple_element"([[RECV_RETVAL0_TUPLE]]) + // CHECK-SAME: index = 1 + %0 = "tf._XlaHostComputeMlir"(%arg0) {recv_key = "recv", send_key = "send", tpu_core = 0 : i64} : (tensor) -> tensor + + // CHECK: return [[RECV_RETVAL0_VAL]], [[RECV_RETVAL0_TOKEN]] + return %0 : tensor +} + +// ----- + +// Test public function with TF/XLA communication op being called by another +// function. The original public function will be modified to create a token, +// while the function is cloned and rewritten with an extra token argument and +// an extra token result. All callers to the original function are updated to +// point to the cloned function and the function the caller is in is updated to +// pass a token or create a token. + +// CHECK: func @main([[MAIN_ARG0:%.*]]: tensor) -> tensor +func @main(%arg0: tensor) -> tensor { + // CHECK: [[MAIN_TOKEN:%.*]] = "mhlo.create_token" + + // CHECK: [[CALL:%.*]]:2 = call [[CALLEE_CLONE:@.*]]([[MAIN_ARG0]], [[MAIN_TOKEN]]) + // CHECK-SAME: (tensor, !mhlo.token) -> (tensor, !mhlo.token) + %0 = call @callee(%arg0) : (tensor) -> tensor + + // CHECK: return [[CALL]]#0 : tensor + return %0 : tensor +} + +// CHECK: func @callee([[CALLEE_ARG0:%.*]]: tensor) -> tensor +func @callee(%arg0: tensor) -> tensor { + // CHECK: [[CALLEE_TOKEN:%.*]] = "mhlo.create_token" + + // CHECK: [[SEND_ARG0_TOKEN:%.*]] = "mhlo.send"([[CALLEE_ARG0]], [[CALLEE_TOKEN]]) + // CHECK: [[RECV_RETVAL0_TUPLE:%.*]] = "mhlo.recv"([[SEND_ARG0_TOKEN]]) + // CHECK: [[RECV_RETVAL0_VAL:%.*]] = "mhlo.get_tuple_element"([[RECV_RETVAL0_TUPLE]]) + // CHECK-SAME: index = 0 + // CHECK: [[RECV_RETVAL0_TOKEN:%.*]] = "mhlo.get_tuple_element"([[RECV_RETVAL0_TUPLE]]) + // CHECK-SAME: index = 1 + %0 = "tf._XlaHostComputeMlir"(%arg0) {recv_key = "recv", send_key = "send", tpu_core = 0 : i64} : (tensor) -> tensor + + // CHECK: return [[RECV_RETVAL0_VAL]] + return %0 : tensor +} + +// CHECK: func [[CALLEE_CLONE]]([[CALLEE_CLONE_ARG0:%.*]]: tensor, [[CALLEE_CLONE_ARG1:%.*]]: !mhlo.token) -> (tensor, !mhlo.token) +// CHECK-NOT: "mhlo.create_token" + +// CHECK: [[CLONE_SEND_ARG0_TOKEN:%.*]] = "mhlo.send"([[CALLEE_CLONE_ARG0]], [[CALLEE_CLONE_ARG1]]) +// CHECK: [[CLONE_RECV_RETVAL0_TUPLE:%.*]] = "mhlo.recv"([[CLONE_SEND_ARG0_TOKEN]]) +// CHECK: [[CLONE_RECV_RETVAL0_VAL:%.*]] = "mhlo.get_tuple_element"([[CLONE_RECV_RETVAL0_TUPLE]]) +// CHECK-SAME: index = 0 +// CHECK: [[CLONE_RECV_RETVAL0_TOKEN:%.*]] = "mhlo.get_tuple_element"([[CLONE_RECV_RETVAL0_TUPLE]]) +// CHECK-SAME: index = 1 + +// CHECK: return [[CLONE_RECV_RETVAL0_VAL]], [[CLONE_RECV_RETVAL0_TOKEN]] + +// ----- + +// Tests generated 
tokens are passed into a function call that also has TF/XLA +// communication ops. + +// CHECK: func @main([[MAIN_ARG0:%.*]]: tensor) +func @main(%arg0: tensor) { + // CHECK: [[MAIN_TOKEN:%.*]] = "mhlo.create_token" + + // CHECK: [[MAIN_SEND0_TOKEN:%.*]] = "mhlo.send"([[MAIN_ARG0]], [[MAIN_TOKEN]]) + "tf.XlaSendToHost"(%arg0) {key = "send0"} : (tensor) -> () + + // CHECK: [[CALL_TOKEN:%.*]] = call @callee([[MAIN_SEND0_TOKEN]]) + // CHECK-SAME: (!mhlo.token) -> !mhlo.token + call @callee() : () -> () + + // CHECK: [[MAIN_SEND2_TOKEN:%.*]] = "mhlo.send"([[MAIN_ARG0]], [[CALL_TOKEN]]) + "tf.XlaSendToHost"(%arg0) {key = "send2"} : (tensor) -> () + return +} + +// CHECK: func @callee([[CALLEE_ARG0:%.*]]: !mhlo.token) -> !mhlo.token +func @callee() attributes {sym_visibility = "private"} { + // CHECK-NOT: "mhlo.create_token" + + // CHECK: [[ZERO:%.*]] = mhlo.constant dense<0> + %0 = mhlo.constant dense<0> : tensor + + // CHECK: [[CALLEE_SEND_TOKEN:%.*]] = "mhlo.send"([[ZERO]], [[CALLEE_ARG0]]) + "tf.XlaSendToHost"(%0) {key = "send1"} : (tensor) -> () + + // CHECK: return [[CALLEE_SEND_TOKEN]] + return +} + +// ----- + +// Test only the top level function generates a token. + +// CHECK: func @callee0() +func @callee0() attributes {sym_visibility = "private"} { + // CHECK: [[INIT_TOKEN:%.*]] = "mhlo.create_token" + + // CHECK: call @callee1([[INIT_TOKEN]]) + call @callee1() : () -> () + return +} + +// CHECK: func @callee1([[CALLEE1_ARG0:%.*]]: !mhlo.token) -> !mhlo.token +func @callee1() attributes {sym_visibility = "private"} { + // CHECK-NOT: "mhlo.create_token" + + // CHECK: [[CALL_2:%.*]] = call @callee2([[CALLEE1_ARG0]]) + call @callee2() : () -> () + + // CHECK: return [[CALL_2]] + return +} + +// CHECK: func @callee2([[CALLEE2_ARG0:%.*]]: !mhlo.token) -> !mhlo.token +func @callee2() attributes {sym_visibility = "private"} { + // CHECK-NOT: "mhlo.create_token" + + // CHECK: [[RECV_TUPLE:%.*]] = "mhlo.recv"([[CALLEE2_ARG0]]) + // CHECK: [[RECV_VAL:%.*]] = "mhlo.get_tuple_element"([[RECV_TUPLE]]) + // CHECK-SAME: index = 0 + // CHECK: [[RECV_TOKEN:%.*]] = "mhlo.get_tuple_element"([[RECV_TUPLE]]) + // CHECK-SAME: index = 1 + %0 = "tf.XlaRecvFromHost"() {key = "recv_key", shape = #tf.shape<>} : () -> tensor + + // CHECK: return [[RECV_TOKEN]] + return +} + +// ----- + +// Test cloned function rewrite also checks transitive function calls to +// TF/XLA communication ops. 
+ +// CHECK: func @callee3() +func @callee3() { + // CHECK: [[CALLEE3_INIT_TOKEN:%.*]] = "mhlo.create_token" + + // CHECK: call @callee4{{.+}}([[CALLEE3_INIT_TOKEN]]) + call @callee4() : () -> () + return +} + +// CHECK: func @callee4() +func @callee4() { + // CHECK: [[CALLEE4_INIT_TOKEN:%.*]] = "mhlo.create_token" + + // CHECK: [[CALL_5:%.*]] = call @callee5([[CALLEE4_INIT_TOKEN]]) + call @callee5() : () -> () + + // CHECK: return + return +} + +// CHECK: func @callee5([[CALLEE5_ARG0:%.*]]: !mhlo.token) -> !mhlo.token +func @callee5() attributes {sym_visibility = "private"} { + // CHECK-NOT: "mhlo.create_token" + + // CHECK: [[RECV_TUPLE:%.*]] = "mhlo.recv"([[CALLEE5_ARG0]]) + // CHECK: [[RECV_VAL:%.*]] = "mhlo.get_tuple_element"([[RECV_TUPLE]]) + // CHECK-SAME: index = 0 + // CHECK: [[RECV_TOKEN:%.*]] = "mhlo.get_tuple_element"([[RECV_TUPLE]]) + // CHECK-SAME: index = 1 + %0 = "tf.XlaRecvFromHost"() {key = "recv_key", shape = #tf.shape<>} : () -> tensor + + // CHECK: return [[RECV_TOKEN]] + return +} + +// CHECK: func @callee4{{.+}}([[CALLEE4_ARG0:%.*]]: !mhlo.token) -> !mhlo.token attributes {sym_visibility = "private"} +// CHECK-NOT: "mhlo.create_token" +// CHECK: [[CALL_5:%.*]] = call @callee5([[CALLEE4_ARG0]]) +// CHECK: return [[CALL_5]] + +// ----- + +// Tests `mhlo.if` with branches populated with TF/XLA communication ops. + +// CHECK-LABEL: func @if_both_branches +// CHECK-SAME: ([[ARG0:%.*]]: tensor, [[ARG1:%.*]]: tensor, [[ARG2:%.*]]: tensor) +func @if_both_branches(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { + // CHECK: [[INIT_TOKEN:%.*]] = "mhlo.create_token" + // CHECK: [[TRUE_TUPLE:%.*]] = "mhlo.tuple"([[ARG1]], [[INIT_TOKEN]]) + // CHECK: [[FALSE_TUPLE:%.*]] = "mhlo.tuple"([[ARG2]], [[INIT_TOKEN]]) + + // CHECK: [[IF_TUPLE:%.*]] = "mhlo.if"([[ARG0]], [[TRUE_TUPLE]], [[FALSE_TUPLE]]) + %0 = "mhlo.if"(%arg0, %arg1, %arg2) ( { + // CHECK: ^bb0([[TRUE_REGION_ARG:%.*]]: tuple, !mhlo.token>): + ^bb0(%arg3: tensor): + // CHECK-DAG: [[TRUE_REGION_ARG_VALUE:%.*]] = "mhlo.get_tuple_element"([[TRUE_REGION_ARG]]) {index = 0 + // CHECK-DAG: [[TRUE_REGION_ARG_TOKEN:%.*]] = "mhlo.get_tuple_element"([[TRUE_REGION_ARG]]) {index = 1 + + // CHECK: [[TRUE_SEND_TOKEN:%.*]] = "mhlo.send"([[TRUE_REGION_ARG_VALUE]], [[TRUE_REGION_ARG_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 1 : i64, type = 2 : i64} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "f32", _xla_host_transfer_rendezvous = "send_if_true_dtoh_0"} + + // CHECK: [[TRUE_RECV_TUPLE:%.*]] = "mhlo.recv"([[TRUE_SEND_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 2 : i64, type = 3 : i64} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "f32", _xla_host_transfer_rendezvous = "recv_if_true_htod_0"} + %1 = "tf._XlaHostComputeMlir"(%arg3) {recv_key = "recv_if_true", send_key = "send_if_true", tpu_core = 0 : i64} : (tensor) -> tensor + + // CHECK-DAG: [[TRUE_GET_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[TRUE_RECV_TUPLE]]) {index = 0 + // CHECK-DAG: [[TRUE_GET_TUPLE_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[TRUE_RECV_TUPLE]]) {index = 1 + // CHECK: [[TRUE_RETURN_TUPLE:%.*]] = "mhlo.tuple"([[TRUE_GET_TUPLE_ELEMENT0]], [[TRUE_GET_TUPLE_ELEMENT1]]) + // CHECK: "mhlo.return"([[TRUE_RETURN_TUPLE]]) + "mhlo.return"(%1) : (tensor) -> () + }, { + // CHECK: ^bb0([[FALSE_REGION_ARG:%.*]]: tuple, !mhlo.token>): + ^bb0(%arg3: tensor): + // CHECK-DAG: [[FALSE_REGION_ARG_VALUE:%.*]] = "mhlo.get_tuple_element"([[FALSE_REGION_ARG]]) {index = 0 + // CHECK-DAG: 
[[FALSE_REGION_ARG_TOKEN:%.*]] = "mhlo.get_tuple_element"([[FALSE_REGION_ARG]]) {index = 1 + + // CHECK: [[FALSE_SEND_TOKEN:%.*]] = "mhlo.send"([[FALSE_REGION_ARG_VALUE]], [[FALSE_REGION_ARG_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 3 : i64, type = 2 : i64} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "f32", _xla_host_transfer_rendezvous = "send_if_false_dtoh_0"} + + // CHECK: [[FALSE_RECV_TUPLE:%.*]] = "mhlo.recv"([[FALSE_SEND_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 4 : i64, type = 3 : i64} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "f32", _xla_host_transfer_rendezvous = "recv_if_false_htod_0"} + %1 = "tf._XlaHostComputeMlir"(%arg3) {recv_key = "recv_if_false", send_key = "send_if_false", tpu_core = 0 : i64} : (tensor) -> tensor + + // CHECK-DAG: [[FALSE_GET_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[FALSE_RECV_TUPLE]]) {index = 0 + // CHECK-DAG: [[FALSE_GET_TUPLE_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[FALSE_RECV_TUPLE]]) {index = 1 + // CHECK: [[FALSE_RETURN_TUPLE:%.*]] = "mhlo.tuple"([[FALSE_GET_TUPLE_ELEMENT0]], [[FALSE_GET_TUPLE_ELEMENT1]]) + // CHECK: "mhlo.return"([[FALSE_RETURN_TUPLE]]) + "mhlo.return"(%1) : (tensor) -> () + + // CHECK: (tensor, tuple, !mhlo.token>, tuple, !mhlo.token>) -> tuple, !mhlo.token> + }) : (tensor, tensor, tensor) -> tensor + + // CHECK: [[IF_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[IF_TUPLE]]) + // CHECK-SAME: index = 0 + // CHECK: return [[IF_TUPLE_ELEMENT0]] + return %0 : tensor +} + +// ----- + +// Tests `mhlo.if` with only the `true` branch populated with TF/XLA +// communication ops. + +// CHECK-LABEL: func @if_true_branch +// CHECK-SAME: ([[ARG0:%.*]]: tensor, [[ARG1:%.*]]: tensor, [[ARG2:%.*]]: tensor) +func @if_true_branch(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { + // CHECK: [[INIT_TOKEN:%.*]] = "mhlo.create_token" + // CHECK: [[TRUE_TUPLE:%.*]] = "mhlo.tuple"([[ARG1]], [[INIT_TOKEN]]) + // CHECK: [[FALSE_TUPLE:%.*]] = "mhlo.tuple"([[ARG2]], [[INIT_TOKEN]]) + + // CHECK: [[IF_TUPLE:%.*]] = "mhlo.if"([[ARG0]], [[TRUE_TUPLE]], [[FALSE_TUPLE]]) + %0 = "mhlo.if"(%arg0, %arg1, %arg2) ( { + // CHECK: ^bb0([[TRUE_REGION_ARG:%.*]]: tuple, !mhlo.token>): + ^bb0(%arg3: tensor): + // CHECK-DAG: [[TRUE_REGION_ARG_VALUE:%.*]] = "mhlo.get_tuple_element"([[TRUE_REGION_ARG]]) {index = 0 + // CHECK-DAG: [[TRUE_REGION_ARG_TOKEN:%.*]] = "mhlo.get_tuple_element"([[TRUE_REGION_ARG]]) {index = 1 + + // CHECK: [[TRUE_SEND_TOKEN:%.*]] = "mhlo.send"([[TRUE_REGION_ARG_VALUE]], [[TRUE_REGION_ARG_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 1 : i64, type = 2 : i64} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "f32", _xla_host_transfer_rendezvous = "send_if_true_dtoh_0"} + + // CHECK: [[TRUE_RECV_TUPLE:%.*]] = "mhlo.recv"([[TRUE_SEND_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 2 : i64, type = 3 : i64} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "f32", _xla_host_transfer_rendezvous = "recv_if_true_htod_0"} + %1 = "tf._XlaHostComputeMlir"(%arg3) {recv_key = "recv_if_true", send_key = "send_if_true", tpu_core = 0 : i64} : (tensor) -> tensor + + // CHECK-DAG: [[TRUE_GET_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[TRUE_RECV_TUPLE]]) {index = 0 + // CHECK-DAG: [[TRUE_GET_TUPLE_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[TRUE_RECV_TUPLE]]) {index = 1 + // CHECK: [[TRUE_RETURN_TUPLE:%.*]] = "mhlo.tuple"([[TRUE_GET_TUPLE_ELEMENT0]], [[TRUE_GET_TUPLE_ELEMENT1]]) + // CHECK: 
"mhlo.return"([[TRUE_RETURN_TUPLE]]) + "mhlo.return"(%1) : (tensor) -> () + }, { + // CHECK: ^bb0([[FALSE_REGION_ARG:%.*]]: tuple, !mhlo.token>): + ^bb0(%arg3: tensor): + // CHECK-DAG: [[FALSE_GET_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[FALSE_REGION_ARG]]) {index = 0 + // CHECK-DAG: [[FALSE_GET_TUPLE_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[FALSE_REGION_ARG]]) {index = 1 + // CHECK: [[FALSE_RETURN_TUPLE:%.*]] = "mhlo.tuple"([[FALSE_GET_TUPLE_ELEMENT0]], [[FALSE_GET_TUPLE_ELEMENT1]]) + // CHECK: "mhlo.return"([[FALSE_RETURN_TUPLE]]) + "mhlo.return"(%arg3) : (tensor) -> () + + // CHECK: (tensor, tuple, !mhlo.token>, tuple, !mhlo.token>) -> tuple, !mhlo.token> + }) : (tensor, tensor, tensor) -> tensor + + // CHECK: [[IF_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[IF_TUPLE]]) + // CHECK-SAME: index = 0 + // CHECK: return [[IF_TUPLE_ELEMENT0]] + return %0 : tensor +} + +// ----- + +// Tests `mhlo.if` with only the `false` branch populated with TF/XLA +// communication ops. + +// CHECK-LABEL: func @if_false_branch +// CHECK-SAME: ([[ARG0:%.*]]: tensor, [[ARG1:%.*]]: tensor, [[ARG2:%.*]]: tensor) +func @if_false_branch(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { + // CHECK: [[INIT_TOKEN:%.*]] = "mhlo.create_token" + // CHECK: [[TRUE_TUPLE:%.*]] = "mhlo.tuple"([[ARG1]], [[INIT_TOKEN]]) + // CHECK: [[FALSE_TUPLE:%.*]] = "mhlo.tuple"([[ARG2]], [[INIT_TOKEN]]) + + // CHECK: [[IF_TUPLE:%.*]] = "mhlo.if"([[ARG0]], [[TRUE_TUPLE]], [[FALSE_TUPLE]]) + %0 = "mhlo.if"(%arg0, %arg1, %arg2) ( { + // CHECK: ^bb0([[TRUE_REGION_ARG:%.*]]: tuple, !mhlo.token>): + ^bb0(%arg3: tensor): + // CHECK-DAG: [[TRUE_GET_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[TRUE_REGION_ARG]]) {index = 0 + // CHECK-DAG: [[TRUE_GET_TUPLE_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[TRUE_REGION_ARG]]) {index = 1 + // CHECK: [[TRUE_RETURN_TUPLE:%.*]] = "mhlo.tuple"([[TRUE_GET_TUPLE_ELEMENT0]], [[TRUE_GET_TUPLE_ELEMENT1]]) + // CHECK: "mhlo.return"([[TRUE_RETURN_TUPLE]]) + "mhlo.return"(%arg3) : (tensor) -> () + }, { + // CHECK: ^bb0([[FALSE_REGION_ARG:%.*]]: tuple, !mhlo.token>): + ^bb0(%arg3: tensor): + // CHECK-DAG: [[FALSE_REGION_ARG_VALUE:%.*]] = "mhlo.get_tuple_element"([[FALSE_REGION_ARG]]) {index = 0 + // CHECK-DAG: [[FALSE_REGION_ARG_TOKEN:%.*]] = "mhlo.get_tuple_element"([[FALSE_REGION_ARG]]) {index = 1 + + // CHECK: [[FALSE_SEND_TOKEN:%.*]] = "mhlo.send"([[FALSE_REGION_ARG_VALUE]], [[FALSE_REGION_ARG_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 1 : i64, type = 2 : i64} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "f32", _xla_host_transfer_rendezvous = "send_if_false_dtoh_0"} + + // CHECK: [[FALSE_RECV_TUPLE:%.*]] = "mhlo.recv"([[FALSE_SEND_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 2 : i64, type = 3 : i64} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "f32", _xla_host_transfer_rendezvous = "recv_if_false_htod_0"} + %1 = "tf._XlaHostComputeMlir"(%arg3) {recv_key = "recv_if_false", send_key = "send_if_false", tpu_core = 0 : i64} : (tensor) -> tensor + + // CHECK-DAG: [[FALSE_GET_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[FALSE_RECV_TUPLE]]) {index = 0 + // CHECK-DAG: [[FALSE_GET_TUPLE_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[FALSE_RECV_TUPLE]]) {index = 1 + // CHECK: [[FALSE_RETURN_TUPLE:%.*]] = "mhlo.tuple"([[FALSE_GET_TUPLE_ELEMENT0]], [[FALSE_GET_TUPLE_ELEMENT1]]) + // CHECK: "mhlo.return"([[FALSE_RETURN_TUPLE]]) + "mhlo.return"(%1) : (tensor) -> () + + // CHECK: (tensor, tuple, !mhlo.token>, tuple, 
!mhlo.token>) -> tuple, !mhlo.token> + }) : (tensor, tensor, tensor) -> tensor + + // CHECK: [[IF_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[IF_TUPLE]]) + // CHECK-SAME: index = 0 + // CHECK: return [[IF_TUPLE_ELEMENT0]] + return %0 : tensor +} + +// ----- + +// Tests `mhlo.if` with tuple arg from a `mhlo.tuple` only used by `mhlo.if` is +// replaced. + +// CHECK-LABEL: func @if_replace_tuple_arg +// CHECK-SAME: ([[ARG0:%.*]]: tensor, [[ARG1:%.*]]: tensor, [[ARG2:%.*]]: tensor) +func @if_replace_tuple_arg(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { + // CHECK-NOT: "mhlo.tuple"([[ARG1]], [[ARG2]]) + // CHECK: [[INIT_TOKEN:%.*]] = "mhlo.create_token" + // CHECK: [[IF_ARG_TUPLE:%.*]] = "mhlo.tuple"([[ARG1]], [[ARG2]], [[INIT_TOKEN]]) + %0 = "mhlo.tuple"(%arg1, %arg2) : (tensor, tensor) -> tuple, tensor> + + // CHECK: [[IF_TUPLE:%.*]] = "mhlo.if"([[ARG0]], [[IF_ARG_TUPLE]], [[IF_ARG_TUPLE]]) + %1 = "mhlo.if"(%arg0, %0, %0) ( { + ^bb0(%arg3: tuple, tensor>): + %2 = "mhlo.get_tuple_element"(%arg3) {index = 0 : i32} : (tuple, tensor>) -> tensor + "tf.XlaSendToHost"(%2) {key = "send_key"} : (tensor) -> () + "mhlo.return"(%2) : (tensor) -> () + }, { + ^bb0(%arg3: tuple, tensor>): + %2 = "mhlo.get_tuple_element"(%arg3) {index = 0 : i32} : (tuple, tensor>) -> tensor + "mhlo.return"(%2) : (tensor) -> () + }) : (tensor, tuple, tensor>, tuple, tensor>) -> tensor + return %1 : tensor +} + +// ----- + +// Tests `mhlo.if` with tuple arg not from a `mhlo.tuple` is unpacked. + +// CHECK-LABEL: func @if_unpack_tuple_arg +// CHECK-SAME: ([[ARG0:%.*]]: tensor, [[ARG1:%.*]]: tuple, tensor>) +func @if_unpack_tuple_arg(%arg0: tensor, %arg1: tuple, tensor>) -> tensor { + // CHECK: [[INIT_TOKEN:%.*]] = "mhlo.create_token" + // CHECK-DAG: [[IF_ARG_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[ARG1]]) {index = 0 + // CHECK-DAG: [[IF_ARG_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[ARG1]]) {index = 1 + // CHECK: [[IF_ARG_TUPLE:%.*]] = "mhlo.tuple"([[IF_ARG_ELEMENT0]], [[IF_ARG_ELEMENT1]], [[INIT_TOKEN]]) + + // CHECK: [[IF_TUPLE:%.*]] = "mhlo.if"([[ARG0]], [[IF_ARG_TUPLE]], [[IF_ARG_TUPLE]]) + %0 = "mhlo.if"(%arg0, %arg1, %arg1) ( { + ^bb0(%arg2: tuple, tensor>): + %1 = "mhlo.get_tuple_element"(%arg2) {index = 0 : i32} : (tuple, tensor>) -> tensor + "tf.XlaSendToHost"(%1) {key = "send_key"} : (tensor) -> () + "mhlo.return"(%1) : (tensor) -> () + }, { + ^bb0(%arg2: tuple, tensor>): + %1 = "mhlo.get_tuple_element"(%arg2) {index = 0 : i32} : (tuple, tensor>) -> tensor + "mhlo.return"(%1) : (tensor) -> () + }) : (tensor, tuple, tensor>, tuple, tensor>) -> tensor + return %0 : tensor +} + +// ----- + +// Tests `mhlo.if` tuple result is extended with a `mhlo.token`. 
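In other words, an `mhlo.if` that already returns a tuple simply gains a trailing `!mhlo.token` element, and the original tuple is reassembled from the leading elements afterwards (f32 element types assumed):

  // Sketch only: element types assumed.
  // result type before:  tuple<tensor<f32>, tensor<f32>>
  // result type after:   tuple<tensor<f32>, tensor<f32>, !mhlo.token>
  %e0  = "mhlo.get_tuple_element"(%if_result) {index = 0 : i32} : (tuple<tensor<f32>, tensor<f32>, !mhlo.token>) -> tensor<f32>
  %e1  = "mhlo.get_tuple_element"(%if_result) {index = 1 : i32} : (tuple<tensor<f32>, tensor<f32>, !mhlo.token>) -> tensor<f32>
  %out = "mhlo.tuple"(%e0, %e1) : (tensor<f32>, tensor<f32>) -> tuple<tensor<f32>, tensor<f32>>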
+ +// CHECK-LABEL: func @if_extend_tuple_result +func @if_extend_tuple_result(%arg0: tensor, %arg1: tuple, tensor>) -> tuple, tensor> { + // CHECK: [[IF_TUPLE:%.*]] = "mhlo.if" + %0 = "mhlo.if"(%arg0, %arg1, %arg1) ( { + ^bb0(%arg2: tuple, tensor>): + %1 = "mhlo.get_tuple_element"(%arg2) {index = 0 : i32} : (tuple, tensor>) -> tensor + "tf.XlaSendToHost"(%1) {key = "send_key"} : (tensor) -> () + "mhlo.return"(%arg2) : (tuple, tensor>) -> () + }, { + ^bb0(%arg2: tuple, tensor>): + "mhlo.return"(%arg2) : (tuple, tensor>) -> () + // CHECK: (tensor, tuple, tensor, !mhlo.token>, tuple, tensor, !mhlo.token>) -> tuple, tensor, !mhlo.token> + }) : (tensor, tuple, tensor>, tuple, tensor>) -> tuple, tensor> + + // CHECK-DAG: [[IF_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[IF_TUPLE]]) {index = 0 + // CHECK-DAG: [[IF_TUPLE_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[IF_TUPLE]]) {index = 1 + // CHECK: [[IF_SUBTUPLE_RESULT:%.*]] = "mhlo.tuple"([[IF_TUPLE_ELEMENT0]], [[IF_TUPLE_ELEMENT1]]) + // CHECK: return [[IF_SUBTUPLE_RESULT]] + return %0 : tuple, tensor> +} + +// ----- + +// Tests nested `mhlo.if` containing TF/XLA communication ops. + +// CHECK-LABEL: func @if_nested +// CHECK-SAME: ([[ARG0:%.*]]: tensor, [[ARG1:%.*]]: tensor) +func @if_nested(%arg0: tensor, %arg1: tensor) -> tensor { + // CHECK: [[INIT_TOKEN:%.*]] = "mhlo.create_token" + // CHECK: [[OUTER_IF_ARG_TUPLE:%.*]] = "mhlo.tuple"([[ARG1]], [[INIT_TOKEN]]) + + // CHECK: "mhlo.if"([[ARG0]], [[OUTER_IF_ARG_TUPLE]], [[OUTER_IF_ARG_TUPLE]]) + %0 = "mhlo.if"(%arg0, %arg1, %arg1) ( { + // CHECK-NEXT: ^bb0([[OUTER_IF_TRUE_ARG:%.*]]: tuple, !mhlo.token>): + ^bb0(%arg2: tensor): + // CHECK-DAG: [[OUTER_IF_TRUE_ARG_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[OUTER_IF_TRUE_ARG]]) {index = 0 + // CHECK-DAG: [[OUTER_IF_TRUE_ARG_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[OUTER_IF_TRUE_ARG]]) {index = 1 + // CHECK: [[INNER_IF_ARG_TUPLE:%.*]] = "mhlo.tuple"([[OUTER_IF_TRUE_ARG_ELEMENT0]], [[OUTER_IF_TRUE_ARG_ELEMENT1]]) + + %1 = mhlo.constant dense : tensor + + // CHECK: [[INNER_IF_TUPLE:%.*]] = "mhlo.if"({{%.*}}, [[INNER_IF_ARG_TUPLE]], [[INNER_IF_ARG_TUPLE]]) + %2 = "mhlo.if"(%1, %arg2, %arg2) ( { + // CHECK-NEXT: ^bb0([[INNER_IF_TRUE_ARG:%.*]]: tuple, !mhlo.token>): + ^bb0(%arg3: tensor): + // CHECK-DAG: [[INNER_IF_TRUE_ARG_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[INNER_IF_TRUE_ARG]]) {index = 0 + // CHECK-DAG: [[INNER_IF_TRUE_ARG_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[INNER_IF_TRUE_ARG]]) {index = 1 + + // CHECK: [[SEND_TOKEN:%.*]] = "mhlo.send"([[INNER_IF_TRUE_ARG_ELEMENT0]], [[INNER_IF_TRUE_ARG_ELEMENT1]]) + "tf.XlaSendToHost"(%arg3) {key = "send_key"} : (tensor) -> () + + // CHECK: [[INNER_IF_TRUE_RESULT:%.*]] = "mhlo.tuple"([[INNER_IF_TRUE_ARG_ELEMENT0]], [[SEND_TOKEN]]) + // CHECK: "mhlo.return"([[INNER_IF_TRUE_RESULT]]) + "mhlo.return"(%arg3) : (tensor) -> () + + // CHECK-NEXT: }, { + }, { + + // CHECK-NEXT: ^bb0([[INNER_IF_FALSE_ARG:%.*]]: tuple, !mhlo.token>): + ^bb0(%arg3: tensor): + // CHECK-DAG: [[INNER_IF_FALSE_ARG_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[INNER_IF_FALSE_ARG]]) {index = 0 + // CHECK-DAG: [[INNER_IF_FALSE_ARG_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[INNER_IF_FALSE_ARG]]) {index = 1 + // CHECK: [[INNER_IF_FALSE_RESULT:%.*]] = "mhlo.tuple"([[INNER_IF_FALSE_ARG_ELEMENT0]], [[INNER_IF_FALSE_ARG_ELEMENT1]]) + // CHECK: "mhlo.return"([[INNER_IF_FALSE_RESULT]]) + "mhlo.return"(%arg3) : (tensor) -> () + // CHECK-NEXT: (tensor, tuple, !mhlo.token>, tuple, !mhlo.token>) -> tuple, !mhlo.token> + }) : (tensor, 
tensor, tensor) -> tensor + + // CHECK-DAG: [[INNER_IF_TUPLE_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[INNER_IF_TUPLE]]) {index = 1 + // CHECK: [[OUTER_IF_TRUE_RESULT:%.*]] = "mhlo.tuple"([[OUTER_IF_TRUE_ARG_ELEMENT0]], [[INNER_IF_TUPLE_ELEMENT1]]) + // CHECK: "mhlo.return"([[OUTER_IF_TRUE_RESULT]]) + "mhlo.return"(%arg2) : (tensor) -> () + + // CHECK-NEXT: }, { + }, { + + // CHECK-NEXT: ^bb0([[OUTER_IF_FALSE_ARG:%.*]]: tuple, !mhlo.token>): + ^bb0(%arg2: tensor): + // CHECK-DAG: [[OUTER_IF_FALSE_ARG_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[OUTER_IF_FALSE_ARG]]) {index = 0 + // CHECK-DAG: [[OUTER_IF_FALSE_ARG_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[OUTER_IF_FALSE_ARG]]) {index = 1 + // CHECK: [[OUTER_IF_FALSE_RESULT:%.*]] = "mhlo.tuple"([[OUTER_IF_FALSE_ARG_ELEMENT0]], [[OUTER_IF_FALSE_ARG_ELEMENT1]]) + // CHECK: "mhlo.return"([[OUTER_IF_FALSE_RESULT]]) + "mhlo.return"(%arg2) : (tensor) -> () + // CHECK-NEXT: (tensor, tuple, !mhlo.token>, tuple, !mhlo.token>) -> tuple, !mhlo.token> + }) : (tensor, tensor, tensor) -> tensor + return %0 : tensor +} + +// ----- + +// Tests `mhlo.if` containing a function call to TF/XLA communication ops. + +// CHECK-LABEL: func @if_function_call +func @if_function_call(%arg0: tensor, %arg1: tensor) -> tensor { + // CHECK: "mhlo.if" + %0 = "mhlo.if"(%arg0, %arg1, %arg1) ( { + // CHECK: ^bb0([[TRUE_REGION_ARG:%.*]]: tuple, !mhlo.token>): + ^bb0(%arg2: tensor): + // CHECK-DAG: [[TRUE_REGION_ARG_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[TRUE_REGION_ARG]]) {index = 0 + // CHECK-DAG: [[TRUE_REGION_ARG_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[TRUE_REGION_ARG]]) {index = 1 + // CHECK: [[CALL_TOKEN:%.*]] = call @callee([[TRUE_REGION_ARG_ELEMENT0]], [[TRUE_REGION_ARG_ELEMENT1]]) + call @callee(%arg2) : (tensor) -> () + + // CHECK: [[TRUE_RETURN_TUPLE:%.*]] = "mhlo.tuple"([[TRUE_REGION_ARG_ELEMENT0]], [[CALL_TOKEN]]) + // CHECK: "mhlo.return"([[TRUE_RETURN_TUPLE]]) + "mhlo.return"(%arg2) : (tensor) -> () + }, { + ^bb0(%arg2: tensor): + "mhlo.return"(%arg2) : (tensor) -> () + }) : (tensor, tensor, tensor) -> tensor + return %0 : tensor +} + +// CHECK-LABEL: func @callee +// CHECK-SAME: ([[CALLEE_ARG0:%.*]]: tensor, [[CALLEE_ARG1:%.*]]: !mhlo.token) -> !mhlo.token +func @callee(%arg0: tensor) attributes {sym_visibility = "private"} { + // CHECK: [[SEND_TOKEN:%.*]] = "mhlo.send" + "tf.XlaSendToHost"(%arg0) {key = "send_key"} : (tensor) -> () + + // CHECK: return [[SEND_TOKEN]] + return +} + +// ----- + +// Tests `mhlo.if` containing multiple TF/XLA communication ops. 
+ +// CHECK-LABEL: func @if_region_multiple_ops +func @if_region_multiple_ops(%arg0: tensor, %arg1: tensor) { + // CHECK: "mhlo.if" + %0 = "mhlo.if"(%arg0, %arg1, %arg1) ( { + // CHECK: ^bb0([[TRUE_REGION_ARG:%.*]]: tuple, !mhlo.token>): + ^bb0(%arg2: tensor): + // CHECK: [[TRUE_REGION_ARG_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[TRUE_REGION_ARG]]) {index = 0 + // CHECK: [[TRUE_REGION_ARG_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[TRUE_REGION_ARG]]) {index = 1 + + // CHECK: [[SEND0_TOKEN:%.*]] = "mhlo.send"([[TRUE_REGION_ARG_ELEMENT0]], [[TRUE_REGION_ARG_ELEMENT1]]) + "tf.XlaSendToHost"(%arg2) {key = "send_key0"} : (tensor) -> () + + // CHECK: [[SEND1_TOKEN:%.*]] = "mhlo.send"([[TRUE_REGION_ARG_ELEMENT0]], [[SEND0_TOKEN]]) + "tf.XlaSendToHost"(%arg2) {key = "send_key1"} : (tensor) -> () + + // CHECK: [[TRUE_RETURN_TUPLE:%.*]] = "mhlo.tuple"([[TRUE_REGION_ARG_ELEMENT0]], [[SEND1_TOKEN]]) + // CHECK: "mhlo.return"([[TRUE_RETURN_TUPLE]]) + "mhlo.return"(%arg2) : (tensor) -> () + }, { + ^bb0(%arg2: tensor): + "mhlo.return"(%arg2) : (tensor) -> () + }) : (tensor, tensor, tensor) -> tensor + return +} + +// ----- + +// Tests `mhlo.if` containing TF/XLA communication ops followed by other TF/XLA +// communication ops. + +func @if_followed_by_communication_op(%arg0: tensor, %arg1: tensor) { + // CHECK: [[IF_TUPLE:%.*]] = "mhlo.if" + %0 = "mhlo.if"(%arg0, %arg1, %arg1) ( { + ^bb0(%arg2: tensor): + "tf.XlaSendToHost"(%arg2) {key = "send_key0"} : (tensor) -> () + "mhlo.return"(%arg2) : (tensor) -> () + }, { + ^bb0(%arg2: tensor): + "mhlo.return"(%arg2) : (tensor) -> () + }) : (tensor, tensor, tensor) -> tensor + + // CHECK: [[IF_TUPLE_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[IF_TUPLE]]) {index = 1 + + // CHECK: "mhlo.send"({{.*}}, [[IF_TUPLE_ELEMENT1]]) + "tf.XlaSendToHost"(%arg1) {key = "send_key1"} : (tensor) -> () + return +} + +// ----- + +// Tests `mhlo.while` with cond and body populated with TF/XLA communication +// ops. 
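Structurally, the loop-carried value is packed with the token so that both the cond and body regions operate on (and the body returns) the same tuple type, and the original result is unpacked afterwards (f32 element type assumed; region bodies elided):

  // Sketch only: element type assumed, cond/body regions elided.
  %token  = "mhlo.create_token"() : () -> !mhlo.token
  %packed = "mhlo.tuple"(%arg0, %token) : (tensor<f32>, !mhlo.token) -> tuple<tensor<f32>, !mhlo.token>
  %loop   = "mhlo.while"(%packed) ( {
    // cond: unpack value/token with mhlo.get_tuple_element, return a tensor<i1>
  }, {
    // body: unpack, thread the token through send/recv, repack, return the tuple
  }) : (tuple<tensor<f32>, !mhlo.token>) -> tuple<tensor<f32>, !mhlo.token>
  %result = "mhlo.get_tuple_element"(%loop) {index = 0 : i32} : (tuple<tensor<f32>, !mhlo.token>) -> tensor<f32>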
+ +// CHECK-LABEL: func @while_cond_body +// CHECK-SAME: ([[ARG0:%.*]]: tensor) +func @while_cond_body(%arg0: tensor) -> tensor { + // CHECK: [[INIT_TOKEN:%.*]] = "mhlo.create_token" + // CHECK: [[ARG_TUPLE:%.*]] = "mhlo.tuple"([[ARG0]], [[INIT_TOKEN]]) + + // CHECK: [[WHILE_TUPLE:%.*]] = "mhlo.while"([[ARG_TUPLE]]) + %0 = "mhlo.while"(%arg0) ( { + // CHECK: ^bb0([[COND_REGION_ARG:%.*]]: tuple, !mhlo.token>): + ^bb0(%arg1: tensor): + // CHECK-DAG: [[COND_REGION_ARG_VALUE:%.*]] = "mhlo.get_tuple_element"([[COND_REGION_ARG]]) {index = 0 + // CHECK-DAG: [[COND_REGION_ARG_TOKEN:%.*]] = "mhlo.get_tuple_element"([[COND_REGION_ARG]]) {index = 1 + + // CHECK: [[COND_SEND_TOKEN:%.*]] = "mhlo.send"([[COND_REGION_ARG_VALUE]], [[COND_REGION_ARG_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 1 : i64, type = 2 : i64} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "f32", _xla_host_transfer_rendezvous = "send_while_cond_dtoh_0"} + + // CHECK: [[COND_RECV_TUPLE:%.*]] = "mhlo.recv"([[COND_SEND_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 2 : i64, type = 3 : i64} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "f32", _xla_host_transfer_rendezvous = "recv_while_cond_htod_0"} + %1 = "tf._XlaHostComputeMlir"(%arg1) {recv_key = "recv_while_cond", send_key = "send_while_cond", tpu_core = 0 : i64} : (tensor) -> tensor + + // CHECK-DAG: [[COND_GET_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[COND_RECV_TUPLE]]) {index = 0 + // CHECK-DAG: [[COND_GET_TUPLE_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[COND_RECV_TUPLE]]) {index = 1 + + // CHECK: [[COND_COMPARE:%.*]] = "mhlo.compare"([[COND_GET_TUPLE_ELEMENT0]], [[COND_GET_TUPLE_ELEMENT0]]) + %2 = "mhlo.compare"(%1, %1) {comparison_direction = "LT"} : (tensor, tensor) -> tensor + + // CHECK: "mhlo.return"([[COND_COMPARE]]) + "mhlo.return"(%2) : (tensor) -> () + }, { + // CHECK: ^bb0([[BODY_REGION_ARG:%.*]]: tuple, !mhlo.token>): + ^bb0(%arg1: tensor): + // CHECK-DAG: [[BODY_REGION_ARG_VALUE:%.*]] = "mhlo.get_tuple_element"([[BODY_REGION_ARG]]) {index = 0 + // CHECK-DAG: [[BODY_REGION_ARG_TOKEN:%.*]] = "mhlo.get_tuple_element"([[BODY_REGION_ARG]]) {index = 1 + + // CHECK: [[BODY_SEND_TOKEN:%.*]] = "mhlo.send"([[BODY_REGION_ARG_VALUE]], [[BODY_REGION_ARG_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 3 : i64, type = 2 : i64} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "f32", _xla_host_transfer_rendezvous = "send_while_body_dtoh_0"} + + // CHECK: [[BODY_RECV_TUPLE:%.*]] = "mhlo.recv"([[BODY_SEND_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 4 : i64, type = 3 : i64} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "f32", _xla_host_transfer_rendezvous = "recv_while_body_htod_0"} + %1 = "tf._XlaHostComputeMlir"(%arg1) {recv_key = "recv_while_body", send_key = "send_while_body", tpu_core = 0 : i64} : (tensor) -> tensor + + // CHECK-DAG: [[BODY_GET_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[BODY_RECV_TUPLE]]) {index = 0 + // CHECK-DAG: [[BODY_GET_TUPLE_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[BODY_RECV_TUPLE]]) {index = 1 + // CHECK: [[BODY_RETURN_TUPLE:%.*]] = "mhlo.tuple"([[BODY_GET_TUPLE_ELEMENT0]], [[BODY_GET_TUPLE_ELEMENT1]]) + // CHECK: "mhlo.return"([[BODY_RETURN_TUPLE]]) + "mhlo.return"(%1) : (tensor) -> () + // CHECK: (tuple, !mhlo.token>) -> tuple, !mhlo.token> + }) : (tensor) -> tensor + + // CHECK: [[WHILE_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[WHILE_TUPLE]]) + // CHECK-SAME: index = 0 + // CHECK: 
return [[WHILE_TUPLE_ELEMENT0]] + return %0 : tensor +} + +// ----- + +// Tests `mhlo.while` with only the `cond` region populated with TF/XLA +// communication ops. + +// CHECK-LABEL: func @while_cond +// CHECK-SAME: ([[ARG0:%.*]]: tensor) +func @while_cond(%arg0: tensor) -> tensor { + // CHECK: [[INIT_TOKEN:%.*]] = "mhlo.create_token" + // CHECK: [[ARG_TUPLE:%.*]] = "mhlo.tuple"([[ARG0]], [[INIT_TOKEN]]) + + // CHECK: [[WHILE_TUPLE:%.*]] = "mhlo.while"([[ARG_TUPLE]]) + %0 = "mhlo.while"(%arg0) ( { + // CHECK: ^bb0([[COND_REGION_ARG:%.*]]: tuple, !mhlo.token>): + ^bb0(%arg1: tensor): + // CHECK-DAG: [[COND_REGION_ARG_VALUE:%.*]] = "mhlo.get_tuple_element"([[COND_REGION_ARG]]) {index = 0 + // CHECK-DAG: [[COND_REGION_ARG_TOKEN:%.*]] = "mhlo.get_tuple_element"([[COND_REGION_ARG]]) {index = 1 + + // CHECK: [[COND_SEND_TOKEN:%.*]] = "mhlo.send"([[COND_REGION_ARG_VALUE]], [[COND_REGION_ARG_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 1 : i64, type = 2 : i64} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "f32", _xla_host_transfer_rendezvous = "send_while_cond_dtoh_0"} + + // CHECK: [[COND_RECV_TUPLE:%.*]] = "mhlo.recv"([[COND_SEND_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 2 : i64, type = 3 : i64} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "f32", _xla_host_transfer_rendezvous = "recv_while_cond_htod_0"} + %1 = "tf._XlaHostComputeMlir"(%arg1) {recv_key = "recv_while_cond", send_key = "send_while_cond", tpu_core = 0 : i64} : (tensor) -> tensor + + // CHECK-DAG: [[COND_GET_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[COND_RECV_TUPLE]]) {index = 0 + // CHECK-DAG: [[COND_GET_TUPLE_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[COND_RECV_TUPLE]]) {index = 1 + + // CHECK: [[COND_COMPARE:%.*]] = "mhlo.compare"([[COND_GET_TUPLE_ELEMENT0]], [[COND_GET_TUPLE_ELEMENT0]]) + %2 = "mhlo.compare"(%1, %1) {comparison_direction = "LT"} : (tensor, tensor) -> tensor + + // CHECK: "mhlo.return"([[COND_COMPARE]]) + "mhlo.return"(%2) : (tensor) -> () + }, { + // CHECK: ^bb0([[BODY_REGION_ARG:%.*]]: tuple, !mhlo.token>): + ^bb0(%arg1: tensor): + // CHECK-DAG: [[BODY_GET_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[BODY_REGION_ARG]]) {index = 0 + // CHECK-DAG: [[BODY_GET_TUPLE_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[BODY_REGION_ARG]]) {index = 1 + // CHECK: [[BODY_RETURN_TUPLE:%.*]] = "mhlo.tuple"([[BODY_GET_TUPLE_ELEMENT0]], [[BODY_GET_TUPLE_ELEMENT1]]) + // CHECK: "mhlo.return"([[BODY_RETURN_TUPLE]]) + "mhlo.return"(%arg1) : (tensor) -> () + // CHECK: (tuple, !mhlo.token>) -> tuple, !mhlo.token> + }) : (tensor) -> tensor + + // CHECK: [[WHILE_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[WHILE_TUPLE]]) + // CHECK-SAME: index = 0 + // CHECK: return [[WHILE_TUPLE_ELEMENT0]] + return %0 : tensor +} + +// ----- + +// Tests `mhlo.while` with only the `body` region populated with TF/XLA +// communication ops. 
+ +// CHECK-LABEL: func @while_body +// CHECK-SAME: ([[ARG0:%.*]]: tensor) +func @while_body(%arg0: tensor) -> tensor { + // CHECK: [[INIT_TOKEN:%.*]] = "mhlo.create_token" + // CHECK: [[ARG_TUPLE:%.*]] = "mhlo.tuple"([[ARG0]], [[INIT_TOKEN]]) + + // CHECK: [[WHILE_TUPLE:%.*]] = "mhlo.while"([[ARG_TUPLE]]) + %0 = "mhlo.while"(%arg0) ( { + // CHECK: ^bb0([[COND_REGION_ARG:%.*]]: tuple, !mhlo.token>): + ^bb0(%arg1: tensor): + // CHECK-DAG: [[COND_GET_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[COND_REGION_ARG]]) {index = 0 + // CHECK-DAG: [[COND_GET_TUPLE_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[COND_REGION_ARG]]) {index = 1 + + // CHECK: [[COND_COMPARE:%.*]] = "mhlo.compare"([[COND_GET_TUPLE_ELEMENT0]], [[COND_GET_TUPLE_ELEMENT0]]) + %2 = "mhlo.compare"(%arg1, %arg1) {comparison_direction = "LT"} : (tensor, tensor) -> tensor + + // CHECK: "mhlo.return"([[COND_COMPARE]]) + "mhlo.return"(%2) : (tensor) -> () + }, { + // CHECK: ^bb0([[BODY_REGION_ARG:%.*]]: tuple, !mhlo.token>): + ^bb0(%arg1: tensor): + // CHECK-DAG: [[BODY_REGION_ARG_VALUE:%.*]] = "mhlo.get_tuple_element"([[BODY_REGION_ARG]]) {index = 0 + // CHECK-DAG: [[BODY_REGION_ARG_TOKEN:%.*]] = "mhlo.get_tuple_element"([[BODY_REGION_ARG]]) {index = 1 + + // CHECK: [[BODY_SEND_TOKEN:%.*]] = "mhlo.send"([[BODY_REGION_ARG_VALUE]], [[BODY_REGION_ARG_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 1 : i64, type = 2 : i64} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "f32", _xla_host_transfer_rendezvous = "send_while_body_dtoh_0"} + + // CHECK: [[BODY_RECV_TUPLE:%.*]] = "mhlo.recv"([[BODY_SEND_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 2 : i64, type = 3 : i64} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "f32", _xla_host_transfer_rendezvous = "recv_while_body_htod_0"} + %1 = "tf._XlaHostComputeMlir"(%arg1) {recv_key = "recv_while_body", send_key = "send_while_body", tpu_core = 0 : i64} : (tensor) -> tensor + + // CHECK-DAG: [[BODY_GET_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[BODY_RECV_TUPLE]]) {index = 0 + // CHECK-DAG: [[BODY_GET_TUPLE_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[BODY_RECV_TUPLE]]) {index = 1 + // CHECK: [[BODY_RETURN_TUPLE:%.*]] = "mhlo.tuple"([[BODY_GET_TUPLE_ELEMENT0]], [[BODY_GET_TUPLE_ELEMENT1]]) + // CHECK: "mhlo.return"([[BODY_RETURN_TUPLE]]) + "mhlo.return"(%1) : (tensor) -> () + // CHECK: (tuple, !mhlo.token>) -> tuple, !mhlo.token> + }) : (tensor) -> tensor + + // CHECK: [[WHILE_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[WHILE_TUPLE]]) + // CHECK-SAME: index = 0 + // CHECK: return [[WHILE_TUPLE_ELEMENT0]] + return %0 : tensor +} + +// ----- + +// Tests `mhlo.while` containing TF/XLA communication ops followed by other +// TF/XLA communication ops. + +func @while_followed_by_communication_op(%arg0: tensor) { + // CHECK: [[WHILE_TUPLE:%.*]] = "mhlo.while" + %0 = "mhlo.while"(%arg0) ( { + ^bb0(%arg1: tensor): + "tf.XlaSendToHost"(%arg1) {key = "send_key0"} : (tensor) -> () + %1 = "mhlo.compare"(%arg1, %arg1) {comparison_direction = "LT"} : (tensor, tensor) -> tensor + "mhlo.return"(%1) : (tensor) -> () + }, { + ^bb0(%arg1: tensor): + "mhlo.return"(%arg1) : (tensor) -> () + }) : (tensor) -> tensor + + // CHECK: [[WHILE_TUPLE_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[WHILE_TUPLE]]) {index = 1 + + // CHECK: "mhlo.send"({{.*}}, [[WHILE_TUPLE_ELEMENT1]]) + "tf.XlaSendToHost"(%arg0) {key = "send_key1"} : (tensor) -> () + return +} + +// ----- + +// Tests unsupported parent of TF/XLA communication op. 
+ +func @unsupported_ancestor(%arg0: tensor, %arg1: tensor) { + %0 = "mhlo.reduce"(%arg0, %arg1) ( { + ^bb0(%arg2: tensor, %arg3: tensor): + %1 = mhlo.add %arg2, %arg3 : tensor + // expected-error@+1 {{expects ancestor(s) to be of ['mhlo.if', 'func']}} + "tf._XlaHostComputeMlir"() {recv_key = "host_compute_channel_recv", send_key = "host_compute_channel_send", tpu_core = 0 : i64} : () -> () + "mhlo.return"(%1) : (tensor) -> () + }) {dimensions = dense<[1]> : tensor<1xi64>} : (tensor, tensor) -> tensor + return +} + +// ----- + +// Tests transitive unsupported parent of TF/XLA communication op. + +func @unsupported_ancestor(%arg0: tensor, %arg1: tensor) { + %0 = "mhlo.reduce"(%arg0, %arg1) ( { + ^bb0(%arg2: tensor, %arg3: tensor): + %1 = mhlo.add %arg2, %arg3 : tensor + // expected-error@+1 {{expects ancestor(s) to be of ['mhlo.if', 'func']}} + call @callee() : () -> () + "mhlo.return"(%1) : (tensor) -> () + }) {dimensions = dense<[1]> : tensor<1xi64>} : (tensor, tensor) -> tensor + return +} + +func @callee() attributes {sym_visibility = "private"} { + "tf._XlaHostComputeMlir"() {recv_key = "host_compute_channel_recv", send_key = "host_compute_channel_send", tpu_core = 0 : i64} : () -> () + return +} + +// ----- + +// Tests unsupported `mhlo.if` with region of more than one block and contains a +// TF/XLA communication op. + +func @if_multiple_blocks(%arg0: tensor, %arg1: tensor) { + %0 = "mhlo.if"(%arg0, %arg1, %arg1) ( { + ^bb0(%arg2: tensor): + br ^bb1(%arg2 : tensor) + ^bb1(%arg3: tensor): + // expected-error@+1 {{expects single block region ancestor(s)}} + "tf.XlaSendToHost"(%arg3) {key = "send_key0"} : (tensor) -> () + "mhlo.return"(%arg3) : (tensor) -> () + }, { + ^bb0(%arg2: tensor): + "mhlo.return"(%arg2) : (tensor) -> () + }) : (tensor, tensor, tensor) -> tensor + return +} + +// ----- + +// Tests function with more than one block that is to be rewritten emits an +// error instead. + +// expected-error@+1 {{'func' ops with more than one block are not supported}} +func @multi_block_func() { + br ^bb1 +^bb1: + %0 = "tf.XlaRecvFromHost"() {key = "recv_key", shape = #tf.shape<>} : () -> tensor + return +} diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-include-tf2xla-fallback.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-include-tf2xla-fallback.mlir new file mode 100644 index 00000000000..9f72820d15b --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-include-tf2xla-fallback.mlir @@ -0,0 +1,50 @@ +// RUN: tf-opt "-xla-legalize-tf=allow-partial-conversion use-tf2xla-fallback=false" -verify-diagnostics %s | FileCheck --check-prefix NO_FALLBACK %s +// RUN: tf-opt "-xla-legalize-tf=use-tf2xla-fallback=true device-type=XLA_CPU_JIT" -verify-diagnostics %s | FileCheck --check-prefix SUPPORTED_FALLBACK_DEVICE %s +// RUN: tf-opt "-xla-legalize-tf=allow-partial-conversion use-tf2xla-fallback=true" %s | FileCheck --check-prefix UNSPECIFIED_FALLBACK_DEVICE %s +// RUN: tf-opt "-xla-legalize-tf=allow-partial-conversion use-tf2xla-fallback=true device-type=INVALID_DEVICE_TYPE" %s | FileCheck --check-prefix UNSUPPORTED_FALLBACK_DEVICE %s + +// We run this test four times: +// 1) Legalize without using TF2XLA fallback (ops cannot be legalized). +// 2) Use fallback with a device that supports all ops (ops can be legalized). +// 3) Use fallback with unspecified device (ops cannot be legalized). +// 4) Use fallback with specified but unsupported device (ops cannot be legalized). 
+// +// Note: For 3) and 4) we do not use `-verify-diagnostics` because these cases +// produce remarks that don't occur for 1) and 2) and there is no way to check +// the remarks only for 3) and 4) (except using two files). + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} { + +// CHECK-LABEL: non_max_suppression_v4 +func @non_max_suppression_v4(%arg0: tensor<3x4xf32>, %arg1: tensor<3xf32>, %arg2: tensor, %arg3: tensor) -> tensor<2xi32> { + %max_size = mhlo.constant dense<2> : tensor + // NO_FALLBACK: tf.NonMaxSuppressionV4 + // SUPPORTED_FALLBACK_DEVICE-NOT: tf.NonMaxSuppressionV4 + // UNSPECIFIED_FALLBACK_DEVICE: tf.NonMaxSuppressionV4 + // UNSUPPORTED_FALLBACK_DEVICE: tf.NonMaxSuppressionV4 + %0:2 = "tf.NonMaxSuppressionV4"(%arg0, %arg1, %max_size, %arg2, %arg3) {pad_to_max_output_size = true}: (tensor<3x4xf32>, tensor<3xf32>, tensor, tensor, tensor) -> (tensor<2xi32>, tensor) + return %0#0 : tensor<2xi32> +} + +// CHECK-LABEL: mirror_pad +func @mirror_pad(%arg0: tensor<2x3xcomplex>) -> tensor<4x7xcomplex> { + %0 = mhlo.constant dense<[[1, 1], [2, 2]]> : tensor<2x2xi32> + // NO_FALLBACK: tf.MirrorPad + // SUPPORTED_FALLBACK_DEVICE-NOT: tf.MirrorPad + // UNSPECIFIED_FALLBACK_DEVICE: tf.MirrorPad + // UNSUPPORTED_FALLBACK_DEVICE: tf.MirrorPad + %1 = "tf.MirrorPad"(%arg0, %0) {mode = "SYMMETRIC"} : (tensor<2x3xcomplex>, tensor<2x2xi32>) -> tensor<4x7xcomplex> + return %1 : tensor<4x7xcomplex> +} + +// CHECK-LABEL: atan2 +func @atan2(%arg0: tensor<4x1xf32>, %arg1: tensor<4x1x4xf32>) -> tensor<4x4x4xf32> { + // NO_FALLBACK: tf.Atan2 + // SUPPORTED_FALLBACK_DEVICE-NOT: tf.Atan2 + // UNSPECIFIED_FALLBACK_DEVICE: tf.Atan2 + // UNSUPPORTED_FALLBACK_DEVICE: tf.Atan2 + %0 = "tf.Atan2"(%arg0, %arg1) : (tensor<4x1xf32>, tensor<4x1x4xf32>) -> tensor<4x4x4xf32> + return %0: tensor<4x4x4xf32> +} + +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir index ad4ef4b8f77..cd351447303 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir @@ -116,8 +116,7 @@ func @convert(%arg0: tensor<2xi32>) -> tensor<2xf32> { // CHECK-LABEL: func @constant func @constant(%arg0: tensor<2xf32>) -> tensor<2xf32> { - // CHECK: %[[SCALAR_ONE:.*]] = mhlo.constant dense<1.000000e+00> : tensor - // CHECK: %[[ONE:.*]] = "mhlo.broadcast_in_dim"(%[[SCALAR_ONE]]) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor) -> tensor<2xf32> + // CHECK: %[[ONE:.*]] = mhlo.constant dense<1.000000e+00> : tensor<2xf32> // CHECK: %[[RESULT:.*]] = mhlo.divide %[[ONE]], %arg0 : tensor<2xf32> // CHECK: return %[[RESULT]] @@ -199,7 +198,6 @@ func @dynamic_update_slice(%arg0: tensor<3x4xi32>, %arg1: tensor<2x2xi32>, %arg2 // CHECK-SAME: (%[[ARG0:.*]]: tensor<3x2xi32>, %[[ARG1:.*]]: tensor<3xf32>, %[[ARG2:.*]]: tensor) func @sparse_to_dense(%arg0: tensor<3x2xi32>, %arg1: tensor<3xf32>, %arg2: tensor) -> tensor<3x3xf32> { -// CHECK: %[[CST:.*]] = mhlo.constant dense<3> : tensor<2xi32> // CHECK: %[[DEFAULT:.*]] = "mhlo.broadcast_in_dim"(%[[ARG2]]) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor) -> tensor<3x3xf32> // CHECK: %[[RESULT:.*]] = "mhlo.scatter"(%[[DEFAULT]], %[[ARG0]], %[[ARG1]]) ( { @@ -259,6 +257,14 @@ func @arg_min(%arg0: tensor<6xf64>) -> tensor { return %1 : tensor } +// CHECK-LABEL: non_max_suppression_v4 +func @non_max_suppression_v4(%arg0: 
tensor<3x4xf32>, %arg1: tensor<3xf32>, %arg2: tensor, %arg3: tensor) -> tensor<2xi32> { + %max_size = mhlo.constant dense<2> : tensor + // CHECK-NOT: tf.NonMaxSuppressionV4 + %0:2 = "tf.NonMaxSuppressionV4"(%arg0, %arg1, %max_size, %arg2, %arg3) {pad_to_max_output_size = true}: (tensor<3x4xf32>, tensor<3xf32>, tensor, tensor, tensor) -> (tensor<2xi32>, tensor) + return %0#0 : tensor<2xi32> +} + // TODO(hinsu): Add a test with a valid TF op for which tf2xla kernel is // available but doesn't support this instance. } diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir index 221fa19f77c..9b32fb97260 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir @@ -1,7 +1,7 @@ -// RUN: tf-opt "-xla-legalize-tf=allow-partial-conversion legalize-chlo=false" %s | FileCheck %s -// RUN: tf-opt "-xla-legalize-tf=allow-partial-conversion legalize-chlo=true" -verify-diagnostics %s +// RUN: tf-opt "-xla-legalize-tf=allow-partial-conversion legalize-chlo=false" %s | FILECHECK_OPTS="" FileCheck %s +// RUN: tf-opt "-xla-legalize-tf=allow-partial-conversion legalize-chlo=true" -verify-diagnostics %s | FileCheck %s --check-prefix CHLO --dump-input-filter=all // This test runs twice: -// 1. Through FileCheck with chlo legalization disabled since verifying +// 1. Through FILECHECK_OPTS="" FileCheck with chlo legalization disabled since verifying // that the chlo ops emit produces more useful tests. // 2. With chlo legalization enabled, verifying diagnostics to pick up any // issues with the full lowering (can catch some broadcasting corner @@ -26,6 +26,28 @@ func @fusedBatchNorm_training(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32>, return %0#0 : tensor<8x8x8x8xf32> } +// fusedBatchNormV2 is almost identical to fusedBatchNormV3 (and uses the same +// code), so only do a couple of basic checks. 
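The fused batch-norm tests that follow only pin down a few landmarks because the interesting numerics live in the `mhlo.batch_norm_inference` / `mhlo.batch_norm_training` ops themselves. For orientation, here is a minimal NumPy sketch of the inference formula those ops implement (illustrative only, with epsilon = 1e-3 and the feature axis last as in NHWC):

import numpy as np

def batch_norm_inference(x, scale, offset, mean, variance, epsilon=1e-3):
    # Normalizes per feature channel (the last axis for NHWC inputs),
    # matching mhlo.batch_norm_inference with feature_index = 3.
    return scale * (x - mean) / np.sqrt(variance + epsilon) + offset

x = np.random.rand(8, 8, 8, 8).astype(np.float32)
scale = np.ones(8, np.float32)
offset = np.zeros(8, np.float32)
mean = np.zeros(8, np.float32)
variance = np.full(8, 0.5, np.float32)
print(batch_norm_inference(x, scale, offset, mean, variance).shape)  # (8, 8, 8, 8)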
+
+// CHECK-LABEL: fusedBatchNormV2_noTraining
+func @fusedBatchNormV2_noTraining(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>) -> (tensor<8x8x8x8xf32>) {
+  // CHECK: "mhlo.batch_norm_inference"({{.*}}, %arg1, %arg2, %arg3, %arg4) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> tensor<8x8x8x8xf32>
+  %0:5 = "tf.FusedBatchNormV2"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, is_training = false} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>)
+  return %0#0 : tensor<8x8x8x8xf32>
+}
+
+// CHECK-LABEL: fusedBatchNormV2_training
+func @fusedBatchNormV2_training(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>) -> (tensor<8x8x8x8xf32>) {
+  // CHECK: %[[RESULT0:.*]] = "mhlo.batch_norm_training"({{.*}}, %arg1, %arg2) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) -> tuple<tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>>
+  %0:5 = "tf.FusedBatchNormV2"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, exponential_avg_factor = 1.0 : f32, is_training = true} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>)
+  // CHECK: "mhlo.get_tuple_element"(%[[RESULT0]]) {index = 0 : i32} : (tuple<tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>>) -> tensor<8x8x8x8xf32>
+  // CHECK: "mhlo.get_tuple_element"(%[[RESULT0]]) {index = 1 : i32} : (tuple<tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>>) -> tensor<8xf32>
+  // CHECK: %[[VAR:.*]] = "mhlo.get_tuple_element"(%[[RESULT0]]) {index = 2 : i32} : (tuple<tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>>) -> tensor<8xf32>
+  // CHECK: mhlo.constant
+  // CHECK: chlo.broadcast_multiply %[[VAR]], {{.*}} : (tensor<8xf32>, tensor<f32>) -> tensor<8xf32>
+  return %0#0 : tensor<8x8x8x8xf32>
+}
+
 // CHECK-LABEL: fusedBatchNormV3_noTraining
 func @fusedBatchNormV3_noTraining(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>) -> (tensor<8x8x8x8xf32>) {
   // CHECK: "mhlo.batch_norm_inference"({{.*}}, %arg1, %arg2, %arg3, %arg4) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> tensor<8x8x8x8xf32>
@@ -473,6 +495,142 @@ func @diag_part(%arg0: tensor<4x3x4x3xf32>) -> tensor<4x3xf32> {
   return %0: tensor<4x3xf32>
 }
 
+//===----------------------------------------------------------------------===//
+// MatrixDiagPart
+//===----------------------------------------------------------------------===//
+
+// CHECK-LABEL: func @matrix_diag_part
+// CHECK-SAME: %[[ARG:.*]]: tensor<7x140x128xi32>
+func @matrix_diag_part(%arg0: tensor<7x140x128xi32>) -> tensor<7x22x128xi32> {
+  // CHECK-DAG: %[[V0:.*]] = mhlo.constant dense<42> : tensor<i32>
+  // CHECK-DAG: %[[V1:.*]] = mhlo.constant dense<[-10, 11]> : tensor<2xi32>
+  // CHECK-DAG: %[[V2:.*]] = "mhlo.iota"() {iota_dimension = 1 : i64} : () -> tensor<1x22x128xi32>
+  // CHECK-DAG: %[[V3:.*]] = "mhlo.iota"() {iota_dimension = 2 : i64} : () -> tensor<1x22x128xi32>
+  // CHECK-DAG: %[[V4:.*]] = mhlo.constant dense<0> : tensor<i32>
+  // CHECK-DAG:
%[[V5:.*]] = "mhlo.broadcast"(%[[V4]]) {broadcast_sizes = dense<[1, 22, 128]> : tensor<3xi64>} : (tensor) -> tensor<1x22x128xi32> + // CHECK-DAG: %[[V6:.*]] = mhlo.constant dense : tensor + // CHECK-DAG: %[[V7:.*]] = "mhlo.broadcast"(%[[V6]]) {broadcast_sizes = dense<[1, 22, 128]> : tensor<3xi64>} : (tensor) -> tensor<1x22x128xi1> + // CHECK-DAG: %[[V8:.*]] = mhlo.constant dense : tensor + // CHECK-DAG: %[[V9:.*]] = "mhlo.broadcast"(%[[V8]]) {broadcast_sizes = dense<[1, 22, 128]> : tensor<3xi64>} : (tensor) -> tensor<1x22x128xi1> + // CHECK-DAG: %[[V10:.*]] = mhlo.constant dense<11> : tensor + // CHECK-DAG: %[[V11:.*]] = "mhlo.broadcast"(%[[V10]]) {broadcast_sizes = dense<[1, 22, 128]> : tensor<3xi64>} : (tensor) -> tensor<1x22x128xi32> + // CHECK-DAG: %[[V12:.*]] = mhlo.constant dense<140> : tensor + // CHECK-DAG: %[[V13:.*]] = "mhlo.broadcast"(%[[V12]]) {broadcast_sizes = dense<[1, 22, 128]> : tensor<3xi64>} : (tensor) -> tensor<1x22x128xi32> + // CHECK-DAG: %[[V14:.*]] = mhlo.constant dense<128> : tensor + // CHECK-DAG: %[[V15:.*]] = "mhlo.broadcast"(%[[V14]]) {broadcast_sizes = dense<[1, 22, 128]> : tensor<3xi64>} : (tensor) -> tensor<1x22x128xi32> + // CHECK-DAG: %[[V16:.*]] = mhlo.constant dense<128> : tensor + // CHECK-DAG: %[[V17:.*]] = "mhlo.broadcast"(%[[V16]]) {broadcast_sizes = dense<[1, 22, 128]> : tensor<3xi64>} : (tensor) -> tensor<1x22x128xi32> + // CHECK-DAG: %[[V18:.*]] = mhlo.subtract %[[V11]], %[[V2]] : tensor<1x22x128xi32> + // CHECK-DAG: %[[V19:.*]] = "mhlo.negate"(%[[V18]]) : (tensor<1x22x128xi32>) -> tensor<1x22x128xi32> + // CHECK-DAG: %[[V20:.*]] = mhlo.minimum %[[V18]], %[[V5]] : tensor<1x22x128xi32> + // CHECK-DAG: %[[V21:.*]] = mhlo.add %[[V13]], %[[V20]] : tensor<1x22x128xi32> + // CHECK-DAG: %[[V22:.*]] = mhlo.maximum %[[V18]], %[[V5]] : tensor<1x22x128xi32> + // CHECK-DAG: %[[V23:.*]] = mhlo.subtract %[[V15]], %[[V22]] : tensor<1x22x128xi32> + // CHECK-DAG: %[[V24:.*]] = mhlo.minimum %[[V21]], %[[V23]] : tensor<1x22x128xi32> + // CHECK-DAG: %[[V25:.*]] = chlo.broadcast_compare %[[V18]], %[[V5]] {comparison_direction = "GE"} : (tensor<1x22x128xi32>, tensor<1x22x128xi32>) -> tensor<1x22x128xi1> + // CHECK-DAG: %[[V26:.*]] = mhlo.subtract %[[V17]], %[[V24]] : tensor<1x22x128xi32> + // CHECK-DAG: %[[V27:.*]] = "mhlo.select"(%[[V25]], %[[V26]], %[[V5]]) : (tensor<1x22x128xi1>, tensor<1x22x128xi32>, tensor<1x22x128xi32>) -> tensor<1x22x128xi32> + // CHECK-DAG: %[[V28:.*]] = mhlo.maximum %[[V18]], %[[V5]] : tensor<1x22x128xi32> + // CHECK-DAG: %[[V29:.*]] = mhlo.subtract %[[V28]], %[[V27]] : tensor<1x22x128xi32> + // CHECK-DAG: %[[V30:.*]] = mhlo.maximum %[[V19]], %[[V5]] : tensor<1x22x128xi32> + // CHECK-DAG: %[[V31:.*]] = mhlo.subtract %[[V30]], %[[V27]] : tensor<1x22x128xi32> + // CHECK-DAG: %[[V32:.*]] = mhlo.add %[[V3]], %[[V29]] : tensor<1x22x128xi32> + // CHECK-DAG: %[[V33:.*]] = mhlo.add %[[V3]], %[[V31]] : tensor<1x22x128xi32> + // CHECK-DAG: %[[V34:.*]] = chlo.broadcast_compare %[[V32]], %[[V5]] {comparison_direction = "GE"} : (tensor<1x22x128xi32>, tensor<1x22x128xi32>) -> tensor<1x22x128xi1> + // CHECK-DAG: %[[V35:.*]] = chlo.broadcast_compare %[[V32]], %[[V15]] {comparison_direction = "LT"} : (tensor<1x22x128xi32>, tensor<1x22x128xi32>) -> tensor<1x22x128xi1> + // CHECK-DAG: %[[V36:.*]] = mhlo.and %[[V34]], %[[V35]] : tensor<1x22x128xi1> + // CHECK-DAG: %[[V37:.*]] = chlo.broadcast_compare %[[V33]], %[[V5]] {comparison_direction = "GE"} : (tensor<1x22x128xi32>, tensor<1x22x128xi32>) -> tensor<1x22x128xi1> + // CHECK-DAG: %[[V38:.*]] = 
chlo.broadcast_compare %[[V33]], %[[V13]] {comparison_direction = "LT"} : (tensor<1x22x128xi32>, tensor<1x22x128xi32>) -> tensor<1x22x128xi1> + // CHECK-DAG: %[[V39:.*]] = mhlo.and %[[V37]], %[[V38]] : tensor<1x22x128xi1> + // CHECK-DAG: %[[V40:.*]] = mhlo.and %[[V36]], %[[V39]] : tensor<1x22x128xi1> + // CHECK-DAG: %[[V41:.*]] = "mhlo.reshape"(%[[V40]]) : (tensor<1x22x128xi1>) -> tensor<22x128xi1> + // CHECK-DAG: %[[V42:.*]] = "mhlo.concatenate"(%[[V33]], %[[V32]]) {dimension = 0 : i64} : (tensor<1x22x128xi32>, tensor<1x22x128xi32>) -> tensor<2x22x128xi32> + // CHECK-DAG: %[[V43:.*]] = "mhlo.gather"(%[[ARG]], %[[V42]]) {dimension_numbers = {collapsed_slice_dims = dense<[1, 2]> : tensor<2xi64>, index_vector_dim = 0 : i64, offset_dims = dense<0> : tensor<1xi64>, start_index_map = dense<[1, 2]> : tensor<2xi64>}, indices_are_sorted = false, slice_sizes = dense<[7, 1, 1]> : tensor<3xi64>} : (tensor<7x140x128xi32>, tensor<2x22x128xi32>) -> tensor<7x22x128xi32> + // CHECK-DAG: %[[V44:.*]] = "mhlo.broadcast"(%[[V41]]) {broadcast_sizes = dense<7> : tensor<1xi64>} : (tensor<22x128xi1>) -> tensor<7x22x128xi1> + // CHECK-DAG: %[[V45:.*]] = "mhlo.broadcast"(%[[V0]]) {broadcast_sizes = dense<[7, 22, 128]> : tensor<3xi64>} : (tensor) -> tensor<7x22x128xi32> + // CHECK: %[[V46:.*]] = "mhlo.select"(%[[V44]], %[[V43]], %[[V45]]) : (tensor<7x22x128xi1>, tensor<7x22x128xi32>, tensor<7x22x128xi32>) -> tensor<7x22x128xi32> + // CHECK: return %[[V46]] : tensor<7x22x128xi32> + %0 = mhlo.constant dense<42> : tensor // padding value + %1 = mhlo.constant dense<[-10, 11]> : tensor<2xi32> // k + %2 = "tf.MatrixDiagPartV3"(%arg0, %1, %0) { + T = i32, align = "RIGHT_LEFT" + } : (tensor<7x140x128xi32>, tensor<2xi32>, tensor) -> tensor<7x22x128xi32> + return %2: tensor<7x22x128xi32> +} + +// CHECK-LABEL: func @matrix_diag_part_single_diagonal +func @matrix_diag_part_single_diagonal(%arg0: tensor<7x140x128xi32>) -> tensor<7x128xi32> { + %0 = mhlo.constant dense<42> : tensor // padding value + %1 = mhlo.constant dense<0> : tensor<2xi32> // k + %2 = "tf.MatrixDiagPartV3"(%arg0, %1, %0) { + T = i32, align = "RIGHT_LEFT" + } : (tensor<7x140x128xi32>, tensor<2xi32>, tensor) -> tensor<7x128xi32> + // CHECK: %[[result:.*]] = "mhlo.reshape"({{.*}}) : (tensor<7x1x128xi32>) -> tensor<7x128xi32> + // CHECK: return %[[result]] : tensor<7x128xi32> + return %2: tensor<7x128xi32> +} + +// CHECK-LABEL: func @matrix_diag_part_align_ll +func @matrix_diag_part_align_ll(%arg0: tensor<7x140x128xi32>) -> tensor<7x22x128xi32> { + %0 = mhlo.constant dense<42> : tensor // padding value + %1 = mhlo.constant dense<[-10, 11]> : tensor<2xi32> // k + %2 = "tf.MatrixDiagPartV3"(%arg0, %1, %0) { + T = i32, align = "LEFT_LEFT" + } : (tensor<7x140x128xi32>, tensor<2xi32>, tensor) -> tensor<7x22x128xi32> + // CHECK: %[[false:.*]] = mhlo.constant dense : tensor + // CHECK: %[[b_false:.*]] = "mhlo.broadcast"(%[[false]]) {broadcast_sizes = dense<[1, 22, 128]> : tensor<3xi64>} : (tensor) -> tensor<1x22x128xi1> + // CHECK: %{{[0-9]*}} = "mhlo.select"(%[[b_false]], %{{[0-9]*}}, %{{[0-9]*}}) : (tensor<1x22x128xi1>, tensor<1x22x128xi32>, tensor<1x22x128xi32>) -> tensor<1x22x128xi32> + return %2: tensor<7x22x128xi32> +} + +// CHECK-LABEL: func @matrix_diag_part_align_lr +func @matrix_diag_part_align_lr(%arg0: tensor<7x140x128xi32>) -> tensor<7x22x128xi32> { + %0 = mhlo.constant dense<42> : tensor // padding value + %1 = mhlo.constant dense<[-10, 11]> : tensor<2xi32> // k + %2 = "tf.MatrixDiagPartV3"(%arg0, %1, %0) { + T = i32, align = "LEFT_RIGHT" + } : 
(tensor<7x140x128xi32>, tensor<2xi32>, tensor) -> tensor<7x22x128xi32> + // CHECK: %[[le:.*]] = chlo.broadcast_compare %{{[0-9]*}}, %{{[0-9]*}} {comparison_direction = "LE"} : (tensor<1x22x128xi32>, tensor<1x22x128xi32>) -> tensor<1x22x128xi1> + // CHECK: %{{[0-9]*}} = "mhlo.select"(%[[le]], %{{[0-9]*}}, %{{[0-9]*}}) : (tensor<1x22x128xi1>, tensor<1x22x128xi32>, tensor<1x22x128xi32>) -> tensor<1x22x128xi32> + return %2: tensor<7x22x128xi32> +} + +// CHECK-LABEL: func @matrix_diag_part_align_rl +func @matrix_diag_part_align_rl(%arg0: tensor<7x140x128xi32>) -> tensor<7x22x128xi32> { + %0 = mhlo.constant dense<42> : tensor // padding value + %1 = mhlo.constant dense<[-10, 11]> : tensor<2xi32> // k + %2 = "tf.MatrixDiagPartV3"(%arg0, %1, %0) { + T = i32, align = "RIGHT_LEFT" + } : (tensor<7x140x128xi32>, tensor<2xi32>, tensor) -> tensor<7x22x128xi32> + // CHECK: %[[ge:.*]] = chlo.broadcast_compare %{{[0-9]*}}, %{{[0-9]*}} {comparison_direction = "GE"} : (tensor<1x22x128xi32>, tensor<1x22x128xi32>) -> tensor<1x22x128xi1> + // CHECK: %{{[0-9]*}} = "mhlo.select"(%[[ge]], %{{[0-9]*}}, %{{[0-9]*}}) : (tensor<1x22x128xi1>, tensor<1x22x128xi32>, tensor<1x22x128xi32>) -> tensor<1x22x128xi32> + return %2: tensor<7x22x128xi32> +} + +// CHECK-LABEL: func @matrix_diag_part_align_rr +func @matrix_diag_part_align_rr(%arg0: tensor<7x140x128xi32>) -> tensor<7x22x128xi32> { + %0 = mhlo.constant dense<42> : tensor // padding value + %1 = mhlo.constant dense<[-10, 11]> : tensor<2xi32> // k + %2 = "tf.MatrixDiagPartV3"(%arg0, %1, %0) { + T = i32, align = "RIGHT_RIGHT" + } : (tensor<7x140x128xi32>, tensor<2xi32>, tensor) -> tensor<7x22x128xi32> + // CHECK: %[[true:.*]] = mhlo.constant dense : tensor + // CHECK: %[[b_true:.*]] = "mhlo.broadcast"(%[[true]]) {broadcast_sizes = dense<[1, 22, 128]> : tensor<3xi64>} : (tensor) -> tensor<1x22x128xi1> + // CHECK: %{{[0-9]*}} = "mhlo.select"(%[[b_true]], %{{[0-9]*}}, %{{[0-9]*}}) : (tensor<1x22x128xi1>, tensor<1x22x128xi32>, tensor<1x22x128xi32>) -> tensor<1x22x128xi32> + return %2: tensor<7x22x128xi32> +} + +// CHECK-LABEL: func @matrix_diag_part_align_7d +// CHECK: (%arg0: tensor<3x5x7x9x11x13x17xf32>) -> tensor<3x5x7x9x11x4x10xf32> +func @matrix_diag_part_align_7d(%arg0: tensor<3x5x7x9x11x13x17xf32>) -> tensor<3x5x7x9x11x4x10xf32> { + %0 = mhlo.constant dense<-1.> : tensor // padding value + %1 = mhlo.constant dense<[-6, -3]> : tensor<2xi32> // k + %2 = "tf.MatrixDiagPartV3"(%arg0, %1, %0) { + T = f32, align = "LEFT_RIGHT" + } : (tensor<3x5x7x9x11x13x17xf32>, tensor<2xi32>, tensor) -> tensor<3x5x7x9x11x4x10xf32> + return %2: tensor<3x5x7x9x11x4x10xf32> +} + //===----------------------------------------------------------------------===// // Einsum. //===----------------------------------------------------------------------===// @@ -958,7 +1116,7 @@ func @test_sparse_mat_mul(%arg0: tensor<3x4xf32>, %arg1: tensor<4x5xf32>) -> ten // SparseMatMul where one operand needs to be transposed and the other one not. 
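The lowering exercised by the next two SparseMatMul tests just materializes the requested transpose (and, in the cast test, a bf16-to-f32 conversion) in front of a plain `mhlo.dot`; the `a_is_sparse`/`b_is_sparse` attributes are performance hints and do not change the math. A NumPy sketch of the transpose case (illustrative only):

import numpy as np

a = np.random.rand(3, 4).astype(np.float32)
b = np.random.rand(5, 4).astype(np.float32)  # operand with transpose_b = true

# tf.SparseMatMul(a, b, transpose_b=True) is numerically just a @ b.T,
# which is why the CHECK lines expect mhlo.transpose followed by mhlo.dot.
print((a @ b.T).shape)  # (3, 5), matching the tensor<3x5xf32> result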
// -// CHECK-LABEL: func @test_sparse_mat_mul_with_transpose +// CHECK-LABEL: @test_sparse_mat_mul_with_transpose // CHECK-SAME: %[[ARG0:.*]]: tensor<3x4xf32> // CHECK-SAME: %[[ARG1:.*]]: tensor<5x4xf32> // CHECK-SAME: -> tensor<3x5xf32> @@ -968,7 +1126,6 @@ func @test_sparse_mat_mul(%arg0: tensor<3x4xf32>, %arg1: tensor<4x5xf32>) -> ten // CHECK: %[[RESULT:.*]] = "mhlo.dot"(%[[ARG0]], %[[TRANSPOSE]]) // CHECK-SAME: -> tensor<3x5xf32> // CHECK: return %[[RESULT]] -// CHECK: } func @test_sparse_mat_mul_with_transpose(%arg0: tensor<3x4xf32>, %arg1: tensor<5x4xf32>) -> tensor<3x5xf32> { %0 = "tf.SparseMatMul"(%arg0, %arg1) {a_is_sparse = true, b_is_sparse = false, transpose_a = false, transpose_b = true} : (tensor<3x4xf32>, tensor<5x4xf32>) -> tensor<3x5xf32> return %0: tensor<3x5xf32> @@ -976,7 +1133,7 @@ func @test_sparse_mat_mul_with_transpose(%arg0: tensor<3x4xf32>, %arg1: tensor<5 // SparseMatMul where one operand needs to be casted and the other one not. // -// CHECK-LABEL: func @test_sparse_mat_mul_with_cast +// CHECK-LABEL: @test_sparse_mat_mul_with_cast // CHECK-SAME: %[[ARG0:.*]]: tensor<3x4xf32> // CHECK-SAME: %[[ARG1:.*]]: tensor<4x5xbf16> // CHECK-SAME: -> tensor<3x5xf32> @@ -985,7 +1142,6 @@ func @test_sparse_mat_mul_with_transpose(%arg0: tensor<3x4xf32>, %arg1: tensor<5 // CHECK: %[[RESULT:.*]] = "mhlo.dot"(%[[ARG0]], %[[CAST]]) // CHECK-SAME: -> tensor<3x5xf32> // CHECK: return %[[RESULT]] -// CHECK: } func @test_sparse_mat_mul_with_cast(%arg0: tensor<3x4xf32>, %arg1: tensor<4x5xbf16>) -> tensor<3x5xf32> { %0 = "tf.SparseMatMul"(%arg0, %arg1) {a_is_sparse = true, b_is_sparse = false, transpose_a = false, transpose_b = false} : (tensor<3x4xf32>, tensor<4x5xbf16>) -> tensor<3x5xf32> return %0: tensor<3x5xf32> @@ -1485,7 +1641,7 @@ func @simple_softmax(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { // CHECK: %[[CASTED_MAX:.*]] = "mhlo.convert"(%[[MAX]]) : (tensor<2xf32>) -> tensor<2xf32> // CHECK: %[[RESULT_SHAPE:.+]] = shape.shape_of %[[ARG0]] - // CHECK: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_SHAPE]] : tensor<2xindex> + // CHECK: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_SHAPE]] // CHECK: %[[BCAST_MAX:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[CASTED_MAX]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} // CHECK: %[[SHIFTED_INP:.*]] = mhlo.subtract %[[ARG0]], %[[BCAST_MAX]] // CHECK: %[[EXP:.*]] = "mhlo.exponential"(%[[SHIFTED_INP]]) @@ -1500,7 +1656,7 @@ func @simple_softmax(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { // CHECK: %[[CASTED_SUM:.*]] = "mhlo.convert"(%[[SUM]]) : (tensor<2xf32>) -> tensor<2xf32> // CHECK: %[[RESULT_SHAPE:.+]] = shape.shape_of %[[ARG0]] - // CHECK: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_SHAPE]] : tensor<2xindex> + // CHECK: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_SHAPE]] // CHECK: %[[BCAST_SUM:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[CASTED_SUM]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} // CHECK: %[[RESULT:.*]] = mhlo.divide %[[EXP]], %[[BCAST_SUM]] // CHECK: return %[[RESULT]] @@ -1557,7 +1713,7 @@ func @simple_logsoftmax(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { // CHECK: %[[CASTED_SUM:.*]] = "mhlo.convert"(%[[SUM]]) : (tensor<2xf32>) -> tensor<2xf32> // CHECK: %[[LOG:.*]] = "mhlo.log"(%[[CASTED_SUM]]) : (tensor<2xf32>) -> tensor<2xf32> // CHECK: %[[RESULT_SHAPE:.+]] = shape.shape_of %[[ARG0]] - // CHECK: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_SHAPE]] : tensor<2xindex> + // CHECK: 
%[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_SHAPE]] // CHECK: %[[BCAST_SUM:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[LOG]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} // CHECK: %[[RESULT:.*]] = mhlo.subtract {{.*}}, %[[BCAST_SUM]] // CHECK: return %[[RESULT]] @@ -1693,6 +1849,48 @@ func @abs_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { return %0 : tensor<*xf32> } +// CHECK-LABEL: @acos +// CHLO-LABEL: @acos +func @acos(%arg0: tensor<2xf32>) -> tensor<2xf32> { + // CHECK: "chlo.acos"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> +// CHLO: %[[VAL_1:.*]] = "mhlo.compare"({{.*}}) {comparison_direction = "NE"} +// CHLO: %[[VAL_5:.*]] = mhlo.multiply %arg0, %arg0 +// CHLO: %[[VAL_4:.*]] = mhlo.constant dense<1.000000e+00> +// CHLO: %[[VAL_6:.*]] = mhlo.subtract %[[VAL_4]], %[[VAL_5]] +// CHLO: %[[VAL_7:.*]] = "mhlo.sqrt"(%[[VAL_6]]) +// CHLO: %[[VAL_8:.*]] = mhlo.constant dense<1.000000e+00> +// CHLO: %[[VAL_9:.*]] = mhlo.add %[[VAL_8]], %arg0 +// CHLO: %[[VAL_10:.*]] = mhlo.atan2 %[[VAL_7]], %[[VAL_9]] +// CHLO: %[[VAL_3:.*]] = mhlo.constant dense<2.000000e+00> +// CHLO: %[[VAL_11:.*]] = mhlo.multiply %[[VAL_3]], %[[VAL_10]] +// CHLO: %[[VAL_12:.*]] = mhlo.constant dense<3.14159274> +// CHLO: %[[VAL_13:.*]] = "mhlo.select"(%[[VAL_1]], %[[VAL_11]], %[[VAL_12]]) +// CHLO: return %[[VAL_13]] : tensor<2xf32> + %0 = "tf.Acos"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> + return %0 : tensor<2xf32> +} + +// CHECK-LABEL: @acos_dynamic +// CHLO-LABEL: @acos_dynamic +func @acos_dynamic(%arg0: tensor<*xf32>) -> tensor<*xf32> { + // CHECK: "chlo.acos"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> +// CHLO: %[[VAL_1:.*]] = "mhlo.compare"({{.*}}) {comparison_direction = "NE"} +// CHLO: %[[VAL_5:.*]] = mhlo.multiply %arg0, %arg0 +// CHLO: %[[VAL_4:.*]] = "chlo.constant_like"(%arg0) {value = 1.000000e+00 : f32} +// CHLO: %[[VAL_6:.*]] = mhlo.subtract %[[VAL_4]], %[[VAL_5]] +// CHLO: %[[VAL_7:.*]] = "mhlo.sqrt"(%[[VAL_6]]) +// CHLO: %[[VAL_8:.*]] = "chlo.constant_like"(%arg0) {value = 1.000000e+00 : f32} +// CHLO: %[[VAL_9:.*]] = mhlo.add %[[VAL_8]], %arg0 +// CHLO: %[[VAL_10:.*]] = mhlo.atan2 %[[VAL_7]], %[[VAL_9]] +// CHLO: %[[VAL_3:.*]] = "chlo.constant_like"(%arg0) {value = 2.000000e+00 : f32} +// CHLO: %[[VAL_11:.*]] = mhlo.multiply %[[VAL_3]], %[[VAL_10]] +// CHLO: %[[VAL_12:.*]] = "chlo.constant_like"(%arg0) {value = 3.14159274 : f32} +// CHLO: %[[VAL_13:.*]] = "mhlo.select"(%[[VAL_1]], %[[VAL_11]], %[[VAL_12]]) +// CHLO: return %[[VAL_13]] + %0 = "tf.Acos"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + // CHECK-LABEL: func @cast_dynamic_i2f func @cast_dynamic_i2f(%arg0: tensor) -> tensor { // CHECK: "mhlo.convert"(%arg0) : (tensor) -> tensor @@ -1900,7 +2098,7 @@ func @neg_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { func @sigmoid(%arg0: tensor<2xf32>) -> tensor<2xf32> { // CHECK-DAG: [[SCALAR:%.+]] = mhlo.constant dense<5.000000e-01> : tensor // CHECK-DAG: [[SHAPE:%.+]] = shape.shape_of %arg0 : tensor<2xf32> - // CHECK-DAG: [[SHAPE_VAL:%.+]] = shape.to_extent_tensor [[SHAPE]] : tensor<1xindex> + // CHECK-DAG: [[SHAPE_VAL:%.+]] = shape.to_extent_tensor [[SHAPE]] // CHECK-DAG: [[HALF:%.+]] = "mhlo.dynamic_broadcast_in_dim"([[SCALAR]], [[SHAPE_VAL]]) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor, tensor<1xindex>) -> tensor<2xf32> // CHECK-DAG: [[R1:%.+]] = mhlo.multiply %arg0, [[HALF]] : tensor<2xf32> // CHECK-DAG: [[R2:%.+]] = "mhlo.tanh"([[R1]]) : (tensor<2xf32>) -> tensor<2xf32> @@ -1922,7 +2120,7 @@ func 
@sigmoid_complex(%arg0: tensor<2xcomplex>) -> tensor<2xcomplex> { func @sigmoid_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { // CHECK-DAG: [[SCALAR:%.+]] = mhlo.constant dense<5.000000e-01> : tensor // CHECK-DAG: [[SHAPE:%.+]] = shape.shape_of %arg0 : tensor<*xf32> - // CHECK-DAG: [[SHAPE_VAL:%.+]] = shape.to_extent_tensor [[SHAPE]] : tensor + // CHECK-DAG: [[SHAPE_VAL:%.+]] = shape.to_extent_tensor [[SHAPE]] // CHECK-DAG: [[HALF:%.+]] = "mhlo.dynamic_broadcast_in_dim"([[SCALAR]], [[SHAPE_VAL]]) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor, tensor) -> tensor<*xf32> // CHECK-DAG: [[R1:%.+]] = mhlo.multiply %arg0, [[HALF]] : tensor<*xf32> // CHECK-DAG: [[R2:%.+]] = "mhlo.tanh"([[R1]]) : (tensor<*xf32>) -> tensor<*xf32> @@ -2126,11 +2324,8 @@ func @expand_dims(%arg0: tensor<2xf32>, %axis: tensor) -> tensor<1x2xf32> { // CHECK-LABEL: func @sign // CHECK-SAME: [[ARG:%arg.*]]: tensor<1x2x3x4xf32> func @sign(%arg0: tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> { - // CHECK: [[PRED:%.*]] = "mhlo.compare"([[ARG]], [[ARG]]) - // CHECK: [[ZEROS:%.*]] = mhlo.constant dense<0.000000e+00> : tensor<1x2x3x4xf32> // CHECK: [[SIGN:%.*]] = "mhlo.sign"([[ARG]]) - // CHECK: [[SELECT:%.*]] = "mhlo.select"([[PRED]], [[ZEROS]], [[SIGN]]) - // CHECK: return [[SELECT]] : tensor<1x2x3x4xf32> + // CHECK: return [[SIGN]] : tensor<1x2x3x4xf32> %0 = "tf.Sign"(%arg0) : (tensor<1x2x3x4xf32>) -> (tensor<1x2x3x4xf32>) return %0 : tensor<1x2x3x4xf32> } @@ -3029,6 +3224,34 @@ func @linspace_invalid_num(%arg0: tensor, %arg1: tensor) -> tensor } +//===----------------------------------------------------------------------===// +// LegacyCall op legalizations. +//===----------------------------------------------------------------------===// + +func @identity_func(%arg0: tensor<10x2xf32>) -> tensor<10x2xf32> { + return %arg0: tensor<10x2xf32> +} + +// CHECK-LABEL: testSimpleLegacyCallOp +func @testSimpleLegacyCallOp(%arg0: tensor<10x2xf32>) -> tensor<10x2xf32> { + // CHECK: %[[RESULT:.*]] = call @identity_func(%arg0) : (tensor<10x2xf32>) -> tensor<10x2xf32> + %0 = "tf.LegacyCall"(%arg0) {f = @identity_func} : (tensor<10x2xf32>) -> tensor<10x2xf32> + // CHECK: return %[[RESULT]] + return %0: tensor<10x2xf32> +} + +func @select_first(%arg0: tensor<10x2xf32>, %arg1: tensor<10x2xf32>) -> tensor<10x2xf32> { + return %arg0: tensor<10x2xf32> +} + +// CHECK-LABEL: testMultiInputLegacyCallOp +func @testMultiInputLegacyCallOp(%arg0: tensor<10x2xf32>, %arg1: tensor<10x2xf32>) -> tensor<10x2xf32> { + // CHECK: %[[RESULT:.*]] = call @select_first(%arg0, %arg1) : (tensor<10x2xf32>, tensor<10x2xf32>) -> tensor<10x2xf32> + %0 = "tf.LegacyCall"(%arg0, %arg1) {_disable_call_shape_inference = true, _tpu_replicate = "cluster", device = "", f = @select_first} : (tensor<10x2xf32>, tensor<10x2xf32>) -> tensor<10x2xf32> + // CHECK: return %[[RESULT]] + return %0: tensor<10x2xf32> +} + //===----------------------------------------------------------------------===// // Conv op legalizations. 
//===----------------------------------------------------------------------===// @@ -3277,8 +3500,8 @@ func @cross_replica_sum(%input: tensor<10xf32>) -> tensor<10xf32> { // tf.Size legalization //===----------------------------------------------------------------------===// -// CHECK-LABEL: @size_rank_one_i32 -func @size_rank_one_i32(%input: tensor) -> (tensor) { +// CHECK-LABEL: @size_scalar_i32 +func @size_scalar_i32(%input: tensor) -> (tensor) { // CHECK: %[[CONST:.*]] = mhlo.constant dense<1> // CHECK-SAME: tensor %size = "tf.Size"(%input) {T = "tfdtype$DT_FLOAT", out_type = "tfdtype$DT_INT32"} : (tensor) -> tensor @@ -3286,8 +3509,8 @@ func @size_rank_one_i32(%input: tensor) -> (tensor) { return %size : tensor } -// CHECK-LABEL: @size_rank_one_i64 -func @size_rank_one_i64(%input: tensor) -> (tensor) { +// CHECK-LABEL: @size_scalar_i64 +func @size_scalar_i64(%input: tensor) -> (tensor) { // CHECK: %[[CONST:.*]] = mhlo.constant dense<1> // CHECK-SAME: tensor %size = "tf.Size"(%input) {T = "tfdtype$DT_FLOAT", out_type = "tfdtype$DT_INT64"} : (tensor) -> tensor @@ -3295,19 +3518,40 @@ func @size_rank_one_i64(%input: tensor) -> (tensor) { return %size : tensor } +// CHECK-LABEL: @size_rank_one_i64 +// CHECK-SAME: (%[[INPUT:.*]]: tensor) +func @size_rank_one_i64(%input: tensor) -> (tensor) { + // CHECK: %[[INIT:.*]] = mhlo.constant dense<1> + // CHECK-SAME: tensor + + // CHECK: %[[DIM_0:.*]] = "mhlo.get_dimension_size"(%[[INPUT]]) + // CHECK-SAME: dimension = 0 + // CHECK-SAME: tensor + + // CHECK: %[[CAST_DIM_0:.*]] = "mhlo.convert"(%[[DIM_0]]) : (tensor) -> tensor + // CHECK: %[[RESULT:.*]] = chlo.broadcast_multiply %[[INIT]], %[[CAST_DIM_0]] + + %size = "tf.Size"(%input) : (tensor) -> tensor + // CHECK: return %[[RESULT]] + return %size : tensor +} + // CHECK-LABEL: @size_ranked // CHECK-SAME: (%[[INPUT:.*]]: tensor<2x?x8xf32>) func @size_ranked(%input: tensor<2x?x8xf32>) -> (tensor) { // CHECK: %[[CONST:.*]] = mhlo.constant dense<1> // CHECK: %[[DIM_0:.*]] = "mhlo.get_dimension_size"(%[[INPUT]]) // CHECK-SAME: dimension = 0 - // CHECK: %[[MUL_0:.*]] = chlo.broadcast_multiply %[[CONST]], %[[DIM_0]] + // CHECK: %[[CAST_DIM_0:.*]] = "mhlo.convert"(%[[DIM_0]]) : (tensor) -> tensor + // CHECK: %[[MUL_0:.*]] = chlo.broadcast_multiply %[[CONST]], %[[CAST_DIM_0]] // CHECK: %[[DIM_1:.*]] = "mhlo.get_dimension_size"(%[[INPUT]]) // CHECK-SAME: dimension = 1 - // CHECK: %[[MUL_1:.*]] = chlo.broadcast_multiply %[[MUL_0]], %[[DIM_1]] + // CHECK: %[[CAST_DIM_1:.*]] = "mhlo.convert"(%[[DIM_1]]) : (tensor) -> tensor + // CHECK: %[[MUL_1:.*]] = chlo.broadcast_multiply %[[MUL_0]], %[[CAST_DIM_1]] // CHECK: %[[DIM_2:.*]] = "mhlo.get_dimension_size"(%[[INPUT]]) // CHECK-SAME: dimension = 2 - // CHECK: %[[MUL_2:.*]] = chlo.broadcast_multiply %[[MUL_1]], %[[DIM_2]] + // CHECK: %[[CAST_DIM_2:.*]] = "mhlo.convert"(%[[DIM_2]]) : (tensor) -> tensor + // CHECK: %[[MUL_2:.*]] = chlo.broadcast_multiply %[[MUL_1]], %[[CAST_DIM_2]] %size = "tf.Size"(%input) {T = "tfdtype$DT_FLOAT", out_type = "tfdtype$DT_INT32"} : (tensor<2x?x8xf32>) -> tensor // CHECK: return %[[MUL_2]] return %size : tensor @@ -3846,36 +4090,167 @@ func @random_shuffle_3D(%input: tensor<4x?x16xf32>) -> tensor<4x?x16xf32> { // tf.AvgPool legalization //===----------------------------------------------------------------------===// -// CHECK-LABEL: avgpool_valid_padding -// CHECK-SAME: [[ARG:%.+]]: tensor<2x12x20x7xf16> -func @avgpool_valid_padding(%arg0: tensor<2x12x20x7xf16>) -> tensor<2x3x5x7xf16> { - // CHECK: [[CONV32:%.+]] = 
"mhlo.convert"(%arg0) : (tensor<2x12x20x7xf16>) -> tensor<2x12x20x7xf32> - // CHECK: [[INIT:%.+]] = mhlo.constant dense<0.000000e+00> : tensor - // CHECK: [[REDUCE:%.+]] = "mhlo.reduce_window"([[CONV32]], [[INIT]]) ( { - // CHECK: ^bb0([[ARG1:%.+]]: tensor, [[ARG2:%.+]]: tensor): - // CHECK: [[ADD:%.+]] = mhlo.add [[ARG1]], [[ARG2]] - // CHECK: "mhlo.return"([[ADD]]) - // CHECK: }) {window_dimensions = dense<[1, 2, 2, 1]> : tensor<4xi64>, window_strides = dense<[1, 4, 4, 1]> : tensor<4xi64>} : (tensor<2x12x20x7xf32>, tensor) -> tensor<2x3x5x7xf32> - // CHECK: [[COUNT:%.+]] = mhlo.constant dense<4.000000e+00> : tensor - // CHECK: [[DIV:%.+]] = chlo.broadcast_divide [[REDUCE]], [[COUNT]] {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<2x3x5x7xf32>, tensor) -> tensor<2x3x5x7xf32> - // CHECK: [[CONV16:%.+]] = "mhlo.convert"([[DIV]]) : (tensor<2x3x5x7xf32>) -> tensor<2x3x5x7xf16> - // CHECK: return [[CONV16]] - %0 = "tf.AvgPool"(%arg0) {data_format = "NHWC", ksize = [1, 2, 2, 1], padding = "VALID", strides = [1, 4, 4, 1]} : (tensor<2x12x20x7xf16>) -> tensor<2x3x5x7xf16> +// CHECK-LABEL: @avgpool_valid_padding +// CHECK-SAME: [[ARG:%.+]]: tensor<2x12x21x7xf16> +// CHECK: [[CONV32:%.+]] = "mhlo.convert"(%arg0) : (tensor<2x12x21x7xf16>) -> tensor<2x12x21x7xf32> +// CHECK: [[ZERO:%.+]] = mhlo.constant dense<0.000000e+00> : tensor +// CHECK: [[DIVIDEND:%.+]] = "mhlo.reduce_window"([[CONV32]], [[ZERO]]) ( { +// CHECK: ^bb0([[ARG1:%.+]]: tensor, [[ARG2:%.+]]: tensor): +// CHECK: [[ADD:%.+]] = mhlo.add [[ARG1]], [[ARG2]] +// CHECK: "mhlo.return"([[ADD]]) +// CHECK: }) +// CHECK-SAME: window_dimensions = dense<[1, 2, 2, 1]> +// CHECK-SAME: window_strides = dense<[1, 4, 4, 1]> +// CHECK-SAME: -> tensor<2x3x5x7xf32> +// CHECK: [[COUNT:%.+]] = mhlo.constant dense<4.000000e+00> : tensor +// CHECK: [[DIV_RESULT:%.+]] = chlo.broadcast_divide [[DIVIDEND]], [[COUNT]] +// CHECK-SAME: broadcast_dimensions = dense<> +// CHECK-SAME: -> tensor<2x3x5x7xf32> +// CHECK: [[CONV16:%.+]] = "mhlo.convert"([[DIV_RESULT]]) +// CHECK-SAME: -> tensor<2x3x5x7xf16> +// CHECK: return [[CONV16]] +func @avgpool_valid_padding(%arg0: tensor<2x12x21x7xf16>) -> tensor<2x3x5x7xf16> { + %0 = "tf.AvgPool"(%arg0) {data_format = "NHWC", ksize = [1, 2, 2, 1], padding = "VALID", strides = [1, 4, 4, 1]} : (tensor<2x12x21x7xf16>) -> tensor<2x3x5x7xf16> return %0 : tensor<2x3x5x7xf16> } -// CHECK-LABEL: avgpool_same_padding -func @avgpool_same_padding(%arg0: tensor<2x13x25x7xf32>) -> tensor<2x4x7x7xf32> { - // CHECK: tf.AvgPool - %0 = "tf.AvgPool"(%arg0) {data_format = "NHWC", ksize = [1, 2, 3, 1], padding = "SAME", strides = [1, 4, 4, 1]} : (tensor<2x13x25x7xf32>) -> tensor<2x4x7x7xf32> - return %0 : tensor<2x4x7x7xf32> +// CHECK-LABEL: @avgpool_3d_valid_padding +// CHECK-SAME: [[ARG:%.+]]: tensor<2x4x12x21x7xf16> +// CHECK: [[CONV32:%.+]] = "mhlo.convert"(%arg0) : (tensor<2x4x12x21x7xf16>) -> tensor<2x4x12x21x7xf32> +// CHECK: [[ZERO:%.+]] = mhlo.constant dense<0.000000e+00> : tensor +// CHECK: [[DIVIDEND:%.+]] = "mhlo.reduce_window"([[CONV32]], [[ZERO]]) ( { +// CHECK: ^bb0([[ARG1:%.+]]: tensor, [[ARG2:%.+]]: tensor): +// CHECK: [[ADD:%.+]] = mhlo.add [[ARG1]], [[ARG2]] +// CHECK: "mhlo.return"([[ADD]]) +// CHECK: }) +// CHECK-SAME: window_dimensions = dense<[1, 1, 2, 2, 1]> +// CHECK-SAME: window_strides = dense<[1, 1, 4, 4, 1]> +// CHECK-SAME: -> tensor<2x4x3x5x7xf32> +// CHECK: [[COUNT:%.+]] = mhlo.constant dense<4.000000e+00> : tensor +// CHECK: [[DIV_RESULT:%.+]] = chlo.broadcast_divide [[DIVIDEND]], [[COUNT]] +// CHECK-SAME: 
broadcast_dimensions = dense<> +// CHECK-SAME: -> tensor<2x4x3x5x7xf32> +// CHECK: [[CONV16:%.+]] = "mhlo.convert"([[DIV_RESULT]]) +// CHECK-SAME: -> tensor<2x4x3x5x7xf16> +// CHECK: return [[CONV16]] +func @avgpool_3d_valid_padding(%arg0: tensor<2x4x12x21x7xf16>) -> tensor<2x4x3x5x7xf16> { + %0 = "tf.AvgPool3D"(%arg0) {data_format = "NDHWC", ksize = [1, 1, 2, 2, 1], padding = "VALID", strides = [1, 1, 4, 4, 1]} : (tensor<2x4x12x21x7xf16>) -> tensor<2x4x3x5x7xf16> + return %0 : tensor<2x4x3x5x7xf16> +} + +// CHECK-LABEL: @avgpool_nchw_format +// CHECK-SAME: [[ARG:%.+]]: tensor<2x7x12x21xf16> +// CHECK: [[CONV32:%.+]] = "mhlo.convert"(%arg0) : (tensor<2x7x12x21xf16>) -> tensor<2x7x12x21xf32> +// CHECK: [[ZERO:%.+]] = mhlo.constant dense<0.000000e+00> : tensor +// CHECK: [[DIVIDEND:%.+]] = "mhlo.reduce_window"([[CONV32]], [[ZERO]]) ( { +// CHECK: ^bb0([[ARG1:%.+]]: tensor, [[ARG2:%.+]]: tensor): +// CHECK: [[ADD:%.+]] = mhlo.add [[ARG1]], [[ARG2]] +// CHECK: "mhlo.return"([[ADD]]) +// CHECK: }) +// CHECK-SAME: window_dimensions = dense<[1, 1, 2, 2]> +// CHECK-SAME: window_strides = dense<[1, 1, 4, 4]> +// CHECK-SAME: -> tensor<2x7x3x5xf32> +// CHECK: [[COUNT:%.+]] = mhlo.constant dense<4.000000e+00> : tensor +// CHECK: [[DIV_RESULT:%.+]] = chlo.broadcast_divide [[DIVIDEND]], [[COUNT]] +// CHECK-SAME: broadcast_dimensions = dense<> +// CHECK-SAME: -> tensor<2x7x3x5xf32> +// CHECK: [[CONV16:%.+]] = "mhlo.convert"([[DIV_RESULT]]) +// CHECK-SAME: -> tensor<2x7x3x5xf16> +// CHECK: return [[CONV16]] +func @avgpool_nchw_format(%arg0: tensor<2x7x12x21xf16>) -> tensor<2x7x3x5xf16> { + %0 = "tf.AvgPool"(%arg0) {data_format = "NCHW", ksize = [1, 1, 2, 2], padding = "VALID", strides = [1, 1, 4, 4]} : (tensor<2x7x12x21xf16>) -> tensor<2x7x3x5xf16> + return %0 : tensor<2x7x3x5xf16> +} + +// CHECK-LABEL: @avgpool_3d_ncdhw_format +// CHECK-SAME: [[ARG:%.+]]: tensor<2x7x4x12x21xf16> +// CHECK: [[CONV32:%.+]] = "mhlo.convert"(%arg0) : (tensor<2x7x4x12x21xf16>) -> tensor<2x7x4x12x21xf32> +// CHECK: [[ZERO:%.+]] = mhlo.constant dense<0.000000e+00> : tensor +// CHECK: [[DIVIDEND:%.+]] = "mhlo.reduce_window"([[CONV32]], [[ZERO]]) ( { +// CHECK: ^bb0([[ARG1:%.+]]: tensor, [[ARG2:%.+]]: tensor): +// CHECK: [[ADD:%.+]] = mhlo.add [[ARG1]], [[ARG2]] +// CHECK: "mhlo.return"([[ADD]]) +// CHECK: }) +// CHECK-SAME: window_dimensions = dense<[1, 1, 1, 2, 2]> +// CHECK-SAME: window_strides = dense<[1, 1, 1, 4, 4]> +// CHECK-SAME: -> tensor<2x7x4x3x5xf32> +// CHECK: [[COUNT:%.+]] = mhlo.constant dense<4.000000e+00> : tensor +// CHECK: [[DIV_RESULT:%.+]] = chlo.broadcast_divide [[DIVIDEND]], [[COUNT]] +// CHECK-SAME: broadcast_dimensions = dense<> +// CHECK-SAME: -> tensor<2x7x4x3x5xf32> +// CHECK: [[CONV16:%.+]] = "mhlo.convert"([[DIV_RESULT]]) +// CHECK-SAME: -> tensor<2x7x4x3x5xf16> +// CHECK: return [[CONV16]] +func @avgpool_3d_ncdhw_format(%arg0: tensor<2x7x4x12x21xf16>) -> tensor<2x7x4x3x5xf16> { + %0 = "tf.AvgPool3D"(%arg0) {data_format = "NCDHW", ksize = [1, 1, 1, 2, 2], padding = "VALID", strides = [1, 1, 1, 4, 4]} : (tensor<2x7x4x12x21xf16>) -> tensor<2x7x4x3x5xf16> + return %0 : tensor<2x7x4x3x5xf16> +} + +// CHECK-LABEL: @avgpool_same_padding( +// CHECK-SAME: %[[ARG0:.*]]: tensor<2x12x21x7xf32>) -> tensor<2x4x6x7xf32> +// CHECK: %[[ZERO:.*]] = mhlo.constant dense<0.000000e+00> : tensor +// CHECK: %[[DIVIDEND:.*]] = "mhlo.reduce_window"(%[[ARG0]], %[[ZERO]]) ( { +// CHECK: ^bb0(%[[ARG1:.*]]: tensor, %[[ARG2:.*]]: tensor): +// CHECK: %[[SUM1:.*]] = mhlo.add %[[ARG1]], %[[ARG2]] : tensor +// CHECK: 
"mhlo.return"(%[[SUM1]]) : (tensor) -> () +// CHECK: }) +// CHECK-SAME: padding = dense<{{\[\[}}0, 0], [1, 1], [0, 1], [0, 0]]> +// CHECK-SAME: window_dimensions = dense<[1, 5, 2, 1]> +// CHECK-SAME: window_strides = dense<[1, 3, 4, 1]> +// CHECK-SAME: -> tensor<2x4x6x7xf32> +// CHECK: %[[ONES:.*]] = mhlo.constant dense<1.000000e+00> : tensor<2x12x21x7xf32> +// CHECK: %[[DIVISOR:.*]] = "mhlo.reduce_window"(%[[ONES]], %[[ZERO]]) ( { +// CHECK: ^bb0(%[[ARG3:.*]]: tensor, %[[ARG4:.*]]: tensor): +// CHECK: %[[SUM2:.*]] = mhlo.add %[[ARG3]], %[[ARG4]] : tensor +// CHECK: "mhlo.return"(%[[SUM2]]) : (tensor) -> () +// CHECK: }) +// CHECK-SAME: padding = dense<{{\[\[}}0, 0], [1, 1], [0, 1], [0, 0]]> +// CHECK-SAME: window_dimensions = dense<[1, 5, 2, 1]> +// CHECK-SAME: window_strides = dense<[1, 3, 4, 1]> +// CHECK-SAME: -> tensor<2x4x6x7xf32> +// CHECK: %[[RESULT:.*]] = mhlo.divide %[[DIVIDEND]], %[[DIVISOR]] : tensor<2x4x6x7xf32> +// CHECK: return %[[RESULT]] : tensor<2x4x6x7xf32> +// CHECK: } +func @avgpool_same_padding(%arg0: tensor<2x12x21x7xf32>) -> tensor<2x4x6x7xf32> { + %0 = "tf.AvgPool"(%arg0) {data_format = "NHWC", ksize = [1, 5, 2, 1], padding = "SAME", strides = [1, 3, 4, 1]} : (tensor<2x12x21x7xf32>) -> tensor<2x4x6x7xf32> + return %0 : tensor<2x4x6x7xf32> +} + +// CHECK-LABEL: @avgpool_3d_same_padding( +// CHECK-SAME: %[[ARG0:.*]]: tensor<2x4x12x21x7xf32>) -> tensor<2x4x4x6x7xf32> +// CHECK: %[[ZERO:.*]] = mhlo.constant dense<0.000000e+00> : tensor +// CHECK: %[[DIVIDEND:.*]] = "mhlo.reduce_window"(%[[ARG0]], %[[ZERO]]) ( { +// CHECK: ^bb0(%[[ARG1:.*]]: tensor, %[[ARG2:.*]]: tensor): +// CHECK: %[[SUM1:.*]] = mhlo.add %[[ARG1]], %[[ARG2]] : tensor +// CHECK: "mhlo.return"(%[[SUM1]]) : (tensor) -> () +// CHECK: }) +// CHECK-SAME: padding = dense<{{\[\[}}0, 0], [0, 0], [1, 1], [0, 1], [0, 0]]> +// CHECK-SAME: window_dimensions = dense<[1, 1, 5, 2, 1]> +// CHECK-SAME: window_strides = dense<[1, 1, 3, 4, 1]> +// CHECK-SAME: -> tensor<2x4x4x6x7xf32> +// CHECK: %[[ONES:.*]] = mhlo.constant dense<1.000000e+00> : tensor<2x4x12x21x7xf32> +// CHECK: %[[DIVISOR:.*]] = "mhlo.reduce_window"(%[[ONES]], %[[ZERO]]) ( { +// CHECK: ^bb0(%[[ARG3:.*]]: tensor, %[[ARG4:.*]]: tensor): +// CHECK: %[[SUM2:.*]] = mhlo.add %[[ARG3]], %[[ARG4]] : tensor +// CHECK: "mhlo.return"(%[[SUM2]]) : (tensor) -> () +// CHECK: }) +// CHECK-SAME: padding = dense<{{\[\[}}0, 0], [0, 0], [1, 1], [0, 1], [0, 0]]> +// CHECK-SAME: window_dimensions = dense<[1, 1, 5, 2, 1]> +// CHECK-SAME: window_strides = dense<[1, 1, 3, 4, 1]> +// CHECK-SAME: -> tensor<2x4x4x6x7xf32> +// CHECK: %[[RESULT:.*]] = mhlo.divide %[[DIVIDEND]], %[[DIVISOR]] +// CHECK: return %[[RESULT]] : tensor<2x4x4x6x7xf32> +// CHECK: } +func @avgpool_3d_same_padding(%arg0: tensor<2x4x12x21x7xf32>) -> tensor<2x4x4x6x7xf32> { + %0 = "tf.AvgPool3D"(%arg0) {data_format = "NDHWC", ksize = [1, 1, 5, 2, 1], padding = "SAME", strides = [1, 1, 3, 4, 1]} : (tensor<2x4x12x21x7xf32>) -> tensor<2x4x4x6x7xf32> + return %0 : tensor<2x4x4x6x7xf32> } //===----------------------------------------------------------------------===// // AvgPoolGrad op legalizations. 
//===----------------------------------------------------------------------===// -// CHECK-LABEL: func @avgpool_grad_valid_padding( +// CHECK-LABEL: @avgpool_grad_valid_padding( // CHECK-SAME: %[[OUT_GRAD:.*]]: tensor<10x12x16x64xf32>) -> tensor<10x24x32x64xf32> { // CHECK: %[[ZERO:.*]] = mhlo.constant dense<0.000000e+00> : tensor // CHECK: %[[DIVISOR:.*]] = mhlo.constant dense<4.000000e+00> : tensor @@ -3907,7 +4282,7 @@ func @avgpool_grad_valid_padding(%grad: tensor<10x12x16x64xf32>) -> tensor<10x24 return %result : tensor<10x24x32x64xf32> } -// CHECK-LABEL: func @avgpool_3d_grad_valid_padding( +// CHECK-LABEL: @avgpool_3d_grad_valid_padding( // CHECK-SAME: %[[OUT_GRAD:.*]]: tensor<10x8x12x16x64xf32>) -> tensor<10x8x24x32x64xf32> { // CHECK: %[[ZERO:.*]] = mhlo.constant dense<0.000000e+00> : tensor // CHECK: %[[DIVISOR:.*]] = mhlo.constant dense<4.000000e+00> : tensor @@ -3936,7 +4311,7 @@ func @avgpool_3d_grad_valid_padding(%grad: tensor<10x8x12x16x64xf32>) -> tensor< return %result : tensor<10x8x24x32x64xf32> } -// CHECK-LABEL: func @avgpool_grad_same_padding( +// CHECK-LABEL: @avgpool_grad_same_padding( // CHECK-SAME: %[[OUT_GRAD:.*]]: tensor<2x4x7x9xf32>) -> tensor<2x13x25x9xf32> { // CHECK: %[[ZERO:.*]] = mhlo.constant dense<0.000000e+00> : tensor // CHECK: %[[ALL_ONES:.*]] = mhlo.constant dense<1.000000e+00> : tensor<2x13x25x9xf32> @@ -3975,7 +4350,7 @@ func @avgpool_grad_same_padding(%grad: tensor<2x4x7x9xf32>) -> tensor<2x13x25x9x return %result : tensor<2x13x25x9xf32> } -// CHECK-LABEL: func @avgpool_3d_grad_same_padding( +// CHECK-LABEL: @avgpool_3d_grad_same_padding( // CHECK-SAME: %[[OUT_GRAD:.*]]: tensor<2x8x4x7x9xf32>) -> tensor<2x8x13x25x9xf32> { // CHECK: %[[ZERO:.*]] = mhlo.constant dense<0.000000e+00> : tensor // CHECK: %[[ALL_ONES:.*]] = mhlo.constant dense<1.000000e+00> : tensor<2x8x13x25x9xf32> @@ -4013,7 +4388,7 @@ func @avgpool_3d_grad_same_padding(%grad: tensor<2x8x4x7x9xf32>) -> tensor<2x8x1 return %result : tensor<2x8x13x25x9xf32> } -// CHECK-LABEL: func @avgpool_grad_nchw_format( +// CHECK-LABEL: @avgpool_grad_nchw_format( // CHECK-SAME: %[[OUT_GRAD:.*]]: tensor<2x9x4x7xf32>) -> tensor<2x9x13x25xf32> { // CHECK: %[[ZERO:.*]] = mhlo.constant dense<0.000000e+00> : tensor // CHECK: %[[ALL_ONES:.*]] = mhlo.constant dense<1.000000e+00> : tensor<2x9x13x25xf32> @@ -4052,7 +4427,7 @@ func @avgpool_grad_nchw_format(%grad: tensor<2x9x4x7xf32>) -> tensor<2x9x13x25xf return %result : tensor<2x9x13x25xf32> } -// CHECK-LABEL: func @avgpool_3d_grad_ncdwh_format( +// CHECK-LABEL: @avgpool_3d_grad_ncdwh_format( // CHECK-SAME: %[[OUT_GRAD:.*]]: tensor<2x9x8x4x7xf32>) -> tensor<2x9x8x13x25xf32> { // CHECK: %[[ZERO:.*]] = mhlo.constant dense<0.000000e+00> : tensor // CHECK: %[[ALL_ONES:.*]] = mhlo.constant dense<1.000000e+00> : tensor<2x9x8x13x25xf32> @@ -4090,7 +4465,7 @@ func @avgpool_3d_grad_ncdwh_format(%grad: tensor<2x9x8x4x7xf32>) -> tensor<2x9x8 return %result : tensor<2x9x8x13x25xf32> } -// CHECK-LABEL: func @avgpool_grad_bf16( +// CHECK-LABEL: @avgpool_grad_bf16( // CHECK-SAME: %[[OUT_GRAD:.*]]: tensor<10x12x16x64xbf16>) -> tensor<10x24x32x64xbf16> { // CHECK: %[[ZERO:.*]] = mhlo.constant dense<0.000000e+00> : tensor // CHECK: %[[DIVISOR:.*]] = mhlo.constant dense<4.000000e+00> : tensor @@ -4227,21 +4602,65 @@ func @cumsum_static(%arg0: tensor<4xf32>) -> tensor<4xf32> { } // CHECK-LABEL: func @cumsum_exclusive +// CHECK-SAME: [[X:%.*]]: tensor<4xf32> func @cumsum_exclusive(%arg0: tensor<4xf32>) -> tensor<4xf32> { - // CHECK: "tf.Cumsum" + // CHECK: [[AXIS:%.*]] = 
mhlo.constant dense<0> : tensor + // CHECK: [[CONVERT_X:%.*]] = "mhlo.convert"([[X]]) : (tensor<4xf32>) -> tensor<4xf32> + // CHECK: [[INIT:%.*]] = mhlo.constant dense<0.000000e+00> : tensor + // CHECK: [[REDUCE:%.*]] = "mhlo.reduce_window"([[CONVERT_X]], [[INIT]]) ( { + // CHECK: ^bb0([[A:%.*]]: tensor, [[B:%.*]]: tensor): + // CHECK: [[SUM:%.*]] = mhlo.add [[A]], [[B]] : tensor + // CHECK: "mhlo.return"([[SUM]]) : (tensor) -> () + // CHECK: }) {padding = dense<{{\[\[}}3, 0]]> : tensor<1x2xi64>, window_dimensions = dense<4> : tensor<1xi64>, window_strides = dense<1> : tensor<1xi64>} : (tensor<4xf32>, tensor) -> tensor<4xf32> + // CHECK: [[PAD:%.*]] = "mhlo.pad"([[REDUCE]], %{{.*}}) {edge_padding_high = dense<-1> : tensor<1xi64>, edge_padding_low = dense<1> : tensor<1xi64>, interior_padding = dense<0> : tensor<1xi64>} : (tensor<4xf32>, tensor) -> tensor<4xf32> + // CHECK: [[CONVERT_REDUCE:%.*]] = "mhlo.convert"([[PAD]]) : (tensor<4xf32>) -> tensor<4xf32> + // CHECK: return [[CONVERT_REDUCE]] %0 = "tf.Const"() {_output_shapes = ["tfshape$"], device = "", dtype = i32, value = dense<0> : tensor} : () -> tensor %1 = "tf.Cumsum"(%arg0, %0) {exclusive = true, reverse = false} : (tensor<4xf32>, tensor) -> tensor<4xf32> return %1 : tensor<4xf32> } // CHECK-LABEL: func @cumsum_reverse +// CHECK-SAME: [[X:%.*]]: tensor<4xf32> func @cumsum_reverse(%arg0: tensor<4xf32>) -> tensor<4xf32> { - // CHECK: "tf.Cumsum" + // CHECK: [[AXIS:%.*]] = mhlo.constant dense<0> : tensor + // CHECK: [[REVERSE1:%.*]] = "mhlo.reverse"([[X]]) {dimensions = dense<0> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<4xf32> + // CHECK: [[CONVERT_X:%.*]] = "mhlo.convert"([[REVERSE1]]) : (tensor<4xf32>) -> tensor<4xf32> + // CHECK: [[INIT:%.*]] = mhlo.constant dense<0.000000e+00> : tensor + // CHECK: [[REDUCE:%.*]] = "mhlo.reduce_window"([[CONVERT_X]], [[INIT]]) ( { + // CHECK: ^bb0([[A:%.*]]: tensor, [[B:%.*]]: tensor): + // CHECK: [[SUM:%.*]] = mhlo.add [[A]], [[B]] : tensor + // CHECK: "mhlo.return"([[SUM]]) : (tensor) -> () + // CHECK: }) {padding = dense<{{\[\[}}3, 0]]> : tensor<1x2xi64>, window_dimensions = dense<4> : tensor<1xi64>, window_strides = dense<1> : tensor<1xi64>} : (tensor<4xf32>, tensor) -> tensor<4xf32> + // CHECK: [[CONVERT_REDUCE:%.*]] = "mhlo.convert"([[REDUCE]]) : (tensor<4xf32>) -> tensor<4xf32> + // CHECK: [[REVERSE_BACK:%.*]] = "mhlo.reverse"([[CONVERT_REDUCE]]) {dimensions = dense<0> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<4xf32> + // CHECK: return [[REVERSE_BACK]] %0 = "tf.Const"() {_output_shapes = ["tfshape$"], device = "", dtype = i32, value = dense<0> : tensor} : () -> tensor %1 = "tf.Cumsum"(%arg0, %0) {exclusive = false, reverse = true} : (tensor<4xf32>, tensor) -> tensor<4xf32> return %1 : tensor<4xf32> } +// CHECK-LABEL: func @cumsum_exclusive_reverse +// CHECK-SAME: [[X:%.*]]: tensor<4xf32> +func @cumsum_exclusive_reverse(%arg0: tensor<4xf32>) -> tensor<4xf32> { + // CHECK: [[AXIS:%.*]] = mhlo.constant dense<0> : tensor + // CHECK: [[REVERSE1:%.*]] = "mhlo.reverse"([[X]]) {dimensions = dense<0> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<4xf32> + // CHECK: [[CONVERT_X:%.*]] = "mhlo.convert"([[REVERSE1]]) : (tensor<4xf32>) -> tensor<4xf32> + // CHECK: [[INIT:%.*]] = mhlo.constant dense<0.000000e+00> : tensor + // CHECK: [[REDUCE:%.*]] = "mhlo.reduce_window"([[CONVERT_X]], [[INIT]]) ( { + // CHECK: ^bb0([[A:%.*]]: tensor, [[B:%.*]]: tensor): + // CHECK: [[SUM:%.*]] = mhlo.add [[A]], [[B]] : tensor + // CHECK: "mhlo.return"([[SUM]]) : (tensor) -> () + // CHECK: }) {padding = dense<{{\[\[}}3, 
0]]> : tensor<1x2xi64>, window_dimensions = dense<4> : tensor<1xi64>, window_strides = dense<1> : tensor<1xi64>} : (tensor<4xf32>, tensor) -> tensor<4xf32> + // CHECK: [[PAD:%.*]] = "mhlo.pad"([[REDUCE]], %{{.*}}) {edge_padding_high = dense<-1> : tensor<1xi64>, edge_padding_low = dense<1> : tensor<1xi64>, interior_padding = dense<0> : tensor<1xi64>} : (tensor<4xf32>, tensor) -> tensor<4xf32> + // CHECK: [[CONVERT_REDUCE:%.*]] = "mhlo.convert"([[PAD]]) : (tensor<4xf32>) -> tensor<4xf32> + // CHECK: [[REVERSE_BACK:%.*]] = "mhlo.reverse"([[CONVERT_REDUCE]]) {dimensions = dense<0> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<4xf32> + // CHECK: return [[REVERSE_BACK]] + %0 = "tf.Const"() {_output_shapes = ["tfshape$"], device = "", dtype = i32, value = dense<0> : tensor} : () -> tensor + %1 = "tf.Cumsum"(%arg0, %0) {exclusive = true, reverse = true} : (tensor<4xf32>, tensor) -> tensor<4xf32> + return %1 : tensor<4xf32> +} + // CHECK-LABEL: func @cumsum_dynamic func @cumsum_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { // CHECK: "tf.Cumsum" @@ -4249,6 +4668,10 @@ func @cumsum_dynamic(%arg0: tensor, %arg1: tensor) -> tensor return %0 : tensor } +//===----------------------------------------------------------------------===// +// Qr op legalization +//===----------------------------------------------------------------------===// + // CHECK: func @qr([[VAL_0:%.*]]: tensor<500x100x75xf32>) -> (tensor<500x100x75xf32>, tensor<500x75x75xf32>) func @qr(%arg0: tensor<500x100x75xf32>) -> (tensor<500x100x75xf32>, tensor<500x75x75xf32>) { // The tf.Qr lowering is a full algorithm that is not effective to verify with diff --git a/tensorflow/compiler/mlir/xla/tests/translate/case.mlir b/tensorflow/compiler/mlir/xla/tests/translate/case.mlir index 57959568287..1032bb723c5 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/case.mlir +++ b/tensorflow/compiler/mlir/xla/tests/translate/case.mlir @@ -1,4 +1,4 @@ -// RUN: tf-mlir-translate -split-input-file -mlir-hlo-to-hlo-text %s | FileCheck %s +// RUN: tf-mlir-translate -split-input-file -mlir-hlo-to-hlo-text %s | FILECHECK_OPTS="" FileCheck %s func @main() -> tensor { %cst = constant {name = "constant"} dense<1> : tensor diff --git a/tensorflow/compiler/mlir/xla/tests/translate/export.mlir b/tensorflow/compiler/mlir/xla/tests/translate/export.mlir index 9929bd85b43..316eda4c4aa 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/export.mlir +++ b/tensorflow/compiler/mlir/xla/tests/translate/export.mlir @@ -1087,3 +1087,15 @@ func @main(%arg: tensor<3x4xf32>, %token: !mhlo.token) -> !mhlo.token { } // CHECK-NOT: frontend_attributes + +// ----- + +// Checks exporting rng-bit-generator. 
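For the rng-bit-generator checks that follow (and the matching import test further down), the integer attribute is a thin encoding of XLA's random-algorithm choice. Below is a minimal sketch of that mapping; only the pairing 2 <-> rng_philox is actually exercised by these tests, and the entries for 0 and 1 are assumptions based on XLA's RandomAlgorithm enum, not something shown in this diff.

// Sketch only: enum values 0 and 1 are assumed; value 2 matches the tests.
enum class RngAlgorithm : int { kDefault = 0, kThreeFry = 1, kPhilox = 2 };

const char* RngAlgorithmName(RngAlgorithm a) {
  switch (a) {
    case RngAlgorithm::kDefault:  return "rng_default";
    case RngAlgorithm::kThreeFry: return "rng_three_fry";
    case RngAlgorithm::kPhilox:   return "rng_philox";  // rng_algorithm = 2
  }
  return "unknown";
}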
+ +// CHECK: HloModule +func @main(%arg: tensor<3xui64>) -> tuple, tensor<2x2xui32>> { +// CHECK: %[[ARG0:.*]] = u64[3] parameter(0) +// CHECK: ROOT %[[RESULT:.*]] = (u64[3], u32[2,2]) rng-bit-generator(u64[3] %[[ARG0]]), algorithm=rng_philox + %0 = "mhlo.rng_bit_generator"(%arg) {rng_algorithm = 2 : i32} : (tensor<3xui64>) -> tuple, tensor<2x2xui32>> + return %0 : tuple, tensor<2x2xui32>> +} diff --git a/tensorflow/compiler/mlir/xla/tests/translate/fusion.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/fusion.hlotxt new file mode 100644 index 00000000000..dc2ce6d58f8 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/translate/fusion.hlotxt @@ -0,0 +1,35 @@ +// RUN: tf-mlir-translate -hlo-text-to-mlir-hlo %s -o - | FileCheck %s + +HloModule main.17 + +// CHECK: func @main(%[[ARG0:.*]]: tensor, %[[ARG1:.*]]: tensor) -> tensor { +// CHECK: %0 = "mhlo.fusion"(%[[ARG0:.*]], %[[ARG1:.*]]) ( { +// CHECK: ^bb0(%[[ARG2:.*]]: tensor, %[[ARG3:.*]]: tensor): +// CHECK: }) {fusion_kind = "kLoop"} : (tensor, tensor) -> tensor +// CHECK: %1 = "mhlo.fusion"(%[[ARG0:.*]], %[[ARG1:.*]]) ( { +// CHECK: ^bb0(%[[ARG2:.*]]: tensor, %[[ARG3:.*]]: tensor): +// CHECK: }) {fusion_kind = "kLoop"} : (tensor, tensor) -> tuple, tensor> +// CHECK: } + +%region_0.3 (Arg_0.4: f32[], Arg_1.5: f32[]) -> f32[] { + %Arg_0.4 = f32[] parameter(0) + %Arg_1.5 = f32[] parameter(1) + ROOT %add.6 = f32[] add(f32[] %Arg_0.4, f32[] %Arg_1.5) +} + +%region_1.8 (Arg_0.9: f32[], Arg_1.10: f32[]) -> (f32[], f32[]) { + %Arg_0.9 = f32[] parameter(0) + %Arg_1.10 = f32[] parameter(1) + %add.11 = f32[] add(f32[] %Arg_0.9, f32[] %Arg_1.10) + %subtract.12 = f32[] subtract(f32[] %Arg_0.9, f32[] %Arg_1.10) + ROOT %tuple.13 = (f32[], f32[]) tuple(f32[] %add.11, f32[] %subtract.12) +} + +ENTRY %main.17 (Arg_0.1: f32[], Arg_1.2: f32[]) -> f32[] { + %Arg_0.1 = f32[] parameter(0) + %Arg_1.2 = f32[] parameter(1) + %fusion.7 = f32[] fusion(f32[] %Arg_0.1, f32[] %Arg_1.2), kind=kLoop, calls=%region_0.3 + %fusion.14 = (f32[], f32[]) fusion(f32[] %Arg_0.1, f32[] %Arg_1.2), kind=kLoop, calls=%region_1.8 + %get-tuple-element.15 = f32[] get-tuple-element((f32[], f32[]) %fusion.14), index=0 + ROOT %get-tuple-element.16 = f32[] get-tuple-element((f32[], f32[]) %fusion.14), index=1 +} diff --git a/tensorflow/compiler/mlir/xla/tests/translate/fusion.mlir b/tensorflow/compiler/mlir/xla/tests/translate/fusion.mlir new file mode 100644 index 00000000000..7da9b7c5f7b --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/translate/fusion.mlir @@ -0,0 +1,27 @@ +// RUN: tf-mlir-translate -mlir-hlo-to-hlo-text %s | FileCheck %s + +// CHECK: %[[REGION0:.*]] ({{.*}}: f32[], {{.*}}: f32[]) -> f32[] +// CHECK: %[[REGION1:.*]] ({{.*}}: f32[], {{.*}}: f32[]) -> (f32[], f32[]) +// +// CHECK: ENTRY +// CHECK: %[[PARAM0:.*]] = f32[] parameter(0) +// CHECK: %[[PARAM1:.*]] = f32[] parameter(1) +// CHECK: %[[FUSION0:.*]] = f32[] fusion(f32[] %[[PARAM0]], f32[] %[[PARAM1]]), kind=kLoop, calls=%[[REGION0]] +// CHECK: %[[FUSION1:.*]] = (f32[], f32[]) fusion(f32[] %[[PARAM0]], f32[] %[[PARAM1]]), kind=kLoop, calls=%[[REGION1]] +// CHECK: f32[] get-tuple-element((f32[], f32[]) %[[FUSION1]]), index=0 +// CHECK: f32[] get-tuple-element((f32[], f32[]) %[[FUSION1]]), index=1 +// CHECK: } +func @main(%arg0: tensor, %arg1: tensor) { + %result = "mhlo.fusion"(%arg0, %arg1) ( { + ^bb0(%arg2: tensor, %arg3: tensor): + %result = "mhlo.add"(%arg2, %arg3): (tensor, tensor) -> tensor + "mhlo.return"(%result) : (tensor) -> () + }) { fusion_kind = "kLoop" } : (tensor, tensor) -> tensor + 
%result0, %result1 = "mhlo.fusion"(%arg0, %arg1) ( { + ^bb0(%arg2: tensor, %arg3: tensor): + %elem0 = "mhlo.add"(%arg2, %arg3): (tensor, tensor) -> tensor + %elem1 = "mhlo.subtract"(%arg2, %arg3): (tensor, tensor) -> tensor + "mhlo.return"(%elem0, %elem1) : (tensor, tensor) -> () + }) { fusion_kind="kLoop" } : (tensor, tensor) -> (tensor, tensor) + return +} diff --git a/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt index 2b7d44f4522..4d4e0213da8 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt @@ -1,4 +1,4 @@ -// RUN: tf-mlir-translate -hlo-text-to-mlir-hlo %s -o - | FileCheck %s -DPRIVATE="attributes {sym_visibility = \"private\"}" +// RUN: tf-mlir-translate -hlo-text-to-mlir-hlo %s -o - | FILECHECK_OPTS="" FileCheck %s -DPRIVATE="attributes {sym_visibility = \"private\"}" HloModule main @@ -1005,3 +1005,12 @@ add { // CHECK: "mhlo.not"(%[[ARG0]]) {name = "{{.*}}"} : (tensor<4xui16>) -> tensor<4xui16> ROOT %not.2 = u16[4] not(u16[4] %Arg_0.1) } + +// CHECK-LABEL: func @rngbitgen +// CHECK-SAME: (%[[ARG0:.*]]: tensor<3xui64>) +%rngbitgen (Arg_0.1: u64[3]) -> (u64[3], u32[2,2]) { + %Arg_0.1 = u64[3] parameter(0) + // CHECK: "mhlo.rng_bit_generator"(%[[ARG0]]) {rng_algorithm = 2 : i32} : (tensor<3xui64>) -> tuple, tensor<2x2xui32>> + ROOT %rng-bit-generator.2 = (u64[3], u32[2,2]) rng-bit-generator(u64[3] %Arg_0.1), algorithm=rng_philox +} + diff --git a/tensorflow/compiler/mlir/xla/tests/translate/non_isolated_computation.mlir b/tensorflow/compiler/mlir/xla/tests/translate/non_isolated_computation.mlir new file mode 100644 index 00000000000..94f53ebbfcb --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/translate/non_isolated_computation.mlir @@ -0,0 +1,16 @@ +// RUN: not tf-mlir-translate -mlir-hlo-to-hlo-text %s 2>&1 | FileCheck %s + +func @main(%arg0: tensor) -> tensor { + %c0 = mhlo.constant dense<1> : tensor + %0 = "mhlo.while"(%arg0) ( { + ^bb0(%arg1: tensor): + // CHECK: requires all operands to be defined in the parent region for export + %1 = "mhlo.compare"(%c0, %arg1) {comparison_direction = "LT"} : (tensor, tensor) -> tensor + "mhlo.return"(%1) : (tensor) -> () + }, { + ^bb0(%arg1: tensor): + %2 = mhlo.add %arg1, %arg1 : tensor + "mhlo.return"(%2) : (tensor) -> () + }) : (tensor) -> tensor + return %0 : tensor +} diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc index 23f11cef4d9..5fe933ee635 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc @@ -71,9 +71,14 @@ class LegalizeTF : public PassWrapper { public: LegalizeTF() = default; LegalizeTF(const LegalizeTF &) {} - explicit LegalizeTF(bool allow_partial_conversion, bool legalize_chlo) { + explicit LegalizeTF(bool allow_partial_conversion, bool legalize_chlo, + llvm::Optional tf2xla_fallback_device_type) { allow_partial_conversion_ = allow_partial_conversion; legalize_chlo_ = legalize_chlo; + use_tf2xla_fallback_ = tf2xla_fallback_device_type.hasValue(); + if (tf2xla_fallback_device_type.hasValue()) { + device_type_ = tf2xla_fallback_device_type.getValue().str(); + } } /// Performs the lowering to XLA dialect. 
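The constructor change above (and the new pass options in the next hunk) hinge on one idea: the optional TF2XLA fallback device type doubles as the enable flag. A minimal standalone sketch of that plumbing follows; LegalizeOptions and MakeOptions are hypothetical stand-ins, and std::optional stands in for llvm::Optional.

#include <optional>
#include <string>

// Hypothetical stand-in for the pass options: an absent device type means the
// TF2XLA fallback patterns are not used; a present one both enables the
// fallback and records which device's kernels drive it.
struct LegalizeOptions {
  bool use_tf2xla_fallback = false;
  std::string device_type = "INVALID_DEVICE_TYPE";
};

LegalizeOptions MakeOptions(std::optional<std::string> fallback_device_type) {
  LegalizeOptions opts;
  opts.use_tf2xla_fallback = fallback_device_type.has_value();
  if (fallback_device_type) opts.device_type = *fallback_device_type;
  return opts;
}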
@@ -89,6 +94,17 @@ class LegalizeTF : public PassWrapper { llvm::cl::desc( "Also legalizes intermediate chlo ops to hlo (default true)"), llvm::cl::init(true)}; + Option use_tf2xla_fallback_{ + *this, "use-tf2xla-fallback", + llvm::cl::desc( + "Also use TF2XLA fallback for legalization (default false)"), + llvm::cl::init(false)}; + Option device_type_{ + *this, "device-type", + llvm::cl::desc( + "The device type used by TF2XLA fallback. Must be specified if " + "use-tf2xla-fallback is true, otherwise not used."), + llvm::cl::init("INVALID_DEVICE_TYPE")}; }; /// Returns if the given TF data format string is the default format. @@ -365,7 +381,7 @@ static Value UpdateSliceInMinorDims(Location loc, Value v, Value update, ArrayRef minor_starts, OpBuilder *builder) { llvm::SmallVector dus_starts(minor_starts.size()); - for (int64_t i = 0; i < minor_starts.size(); ++i) { + for (uint64_t i = 0; i < minor_starts.size(); ++i) { dus_starts[i] = GetScalarConstOfType(builder->getIntegerType(32), loc, minor_starts[i], builder); } @@ -808,7 +824,7 @@ static DenseIntElementsAttr SliceDenseIntElementsAttrColumn2D( values.reserve(shaped_type.getNumElements() / shape[1]); for (auto it : llvm::enumerate(int_attr.getIntValues())) { - if (it.index() % shape[1] == column) { + if (static_cast(it.index() % shape[1]) == column) { values.push_back(it.value().getSExtValue()); } } @@ -896,7 +912,7 @@ static DenseElementsAttr GetEpsilonValue(Type ty) { auto value = APFloat(APFloat::IEEEhalf(), APInt(16, raw_epsilon)); return DenseElementsAttr::get(scalar_ty, value); } else if (element_ty.isBF16()) { - uint16_t raw_epsilon = tensorflow::bfloat16::epsilon().value; + uint16_t raw_epsilon = Eigen::NumTraits::epsilon().value; auto value = APFloat(APFloat::BFloat(), APInt(16, raw_epsilon)); return DenseElementsAttr::get(scalar_ty, value); } else if (element_ty.isF32()) { @@ -1387,6 +1403,269 @@ class ConvertDiagPartOp : public OpRewritePattern { } }; +// Converts TensorFlow MatrixDiagPartOp to HLO ops. +class ConvertMatrixDiagPartV3Op + : public OpRewritePattern { + using Shape = llvm::SmallVector; + + // Parse the "k" parameter. MatrixDiagPartV3 allows to specify the diagonal(s) + // with k. This can be either a single value (for a single diagonal) or a + // tuple of two values (starting and ending diagonal, for a band). + LogicalResult ExtractK(TF::MatrixDiagPartV3Op op, int64_t (*k)[2]) const { + DenseIntElementsAttr kattr; + if (!matchPattern(op.k(), m_Constant(&kattr))) { + return failure(); + } + DenseIntElementsAttr::iterator it = kattr.begin(); + (*k)[0] = (*it).getSExtValue(); + it++; + if (it == kattr.end()) { + // Handle input like e.g. "k = 5", in which case we extract a single + // diagonal. + (*k)[1] = (*k)[0]; + } else { + // Handle input like e.g. "k = [-1, 1]", in which case we extract a + // band (multiple diagonals). + (*k)[1] = (*it).getSExtValue(); + } + return success(); + } + + // Utility method for broadcasting integer constants to a given shape. 
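The ExtractK helper above encodes the MatrixDiagPartV3 convention for the "k" input: one element selects a single diagonal, two elements select a band. A standalone sketch of that convention, with ParseK as a hypothetical helper over a plain vector rather than a DenseIntElementsAttr:

#include <array>
#include <cstdint>
#include <vector>

// Mirrors ExtractK: "k = 5" selects the single diagonal 5; "k = [-1, 1]"
// selects the band from subdiagonal -1 up to superdiagonal 1 (inclusive).
std::array<int64_t, 2> ParseK(const std::vector<int64_t>& k_attr) {
  std::array<int64_t, 2> k = {0, 0};
  if (k_attr.empty()) return k;  // the op always supplies at least one value
  k[0] = k_attr[0];
  k[1] = (k_attr.size() > 1) ? k_attr[1] : k_attr[0];
  return k;
}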
+ BroadcastOp BroadcastConstant(Location loc, Shape shape, int32_t constant, + int int_size, PatternRewriter &rewriter) const { + return rewriter.create( + loc, RankedTensorType::get(shape, rewriter.getIntegerType(int_size)), + GetScalarConstOfType(rewriter.getIntegerType(int_size), loc, constant, + &rewriter), + GetI64ElementsAttr(shape, &rewriter)); + } + + public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(TF::MatrixDiagPartV3Op op, + PatternRewriter &rewriter) const override { + Location loc = op.getLoc(); + ShapedType input_type = op.input().getType().dyn_cast(); + auto element_type = input_type.getElementType(); + + // Align is a string specifying how superdiagonals and subdiagonals should + // be aligned/padded for diagonals that are shorter than max_diag_len. The + // format is "{super}_{sub}", with {super} the superdiagonal alignment and + // {sub} the subdiagonal alignment. "LEFT" means rows will be padded to the + // left, "RIGHT" means rows will be padded ot the right. The default is + // "RIGHT_LEFT". + StringRef align = op.getAttrOfType("align").getValue(); + enum Alignment { kLeft, kRight }; + + // default is RIGHT_LEFT + Alignment superdiagonal_align = kRight; + Alignment subdiagonal_align = kLeft; + + if (align == "RIGHT_LEFT") { + superdiagonal_align = kRight; + subdiagonal_align = kLeft; + } else if (align == "RIGHT_RIGHT") { + superdiagonal_align = kRight; + subdiagonal_align = kRight; + } else if (align == "LEFT_RIGHT") { + superdiagonal_align = kLeft; + subdiagonal_align = kRight; + } else if (align == "LEFT_LEFT") { + superdiagonal_align = kLeft; + subdiagonal_align = kLeft; + } else { + return failure(); // unsupported alignment + } + + // MatrixDiagPart operates on a matrix of shape [I, J, ..., L, M, N], and + // will extract the diagonal(s) out of [M, N], for all [I, J, ..., L]. + if (!input_type || !input_type.hasStaticShape()) return failure(); + int64_t num_dims = input_type.getRank(); + if (num_dims < 2) return failure(); + int64_t rows = input_type.getDimSize(num_dims - 2); // rows + int64_t cols = input_type.getDimSize(num_dims - 1); // cols + + // We extract the diagonals from k[0] up to and including k[1]. + // Addressing is 0 for the main diagonal. (So k = [0, 0] would just extract + // the main diagonal). It's negative for subdiagonals (under and to the left + // of the main diagonal) and positive for superdiagonals (above and to the + // right of the main diagonal). + int64_t k[2]; + if (failed(ExtractK(op, &k))) return failure(); + int num_diags = k[1] - k[0] + 1; + + // Shifting diagonals away from the main diagonal might shorten them. This + // is the longest diagonal we will see. We make this the last dimension of + // the output shape. + int64_t max_diag_len = + std::min(rows + std::min(k[1], static_cast(0)), + cols + std::min(-k[0], static_cast(0))); + + // The first dimension is the index vector dimension we'll use for gather. + // It's 1 here, but will be 2 once we glue x and y together. + Shape indices_shape({1, num_diags, max_diag_len}); + + RankedTensorType iota_type = + RankedTensorType::get(indices_shape, rewriter.getIntegerType(32)); + Value iotaM = + rewriter.create(loc, iota_type, rewriter.getI64IntegerAttr(1)); + Value iotaN = + rewriter.create(loc, iota_type, rewriter.getI64IntegerAttr(2)); + + // Boradcasted constants, of the same shape as iotaM and iotaN. 
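A worked, self-contained version of the diagonal bookkeeping computed just above (num_diags, max_diag_len, and the per-diagonal length diag_len_d). The concrete numbers (a 3x4 matrix with k = [-1, 1]) are my own example; the MLIR pattern evaluates the same formulas on broadcast i32 tensors.

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t rows = 3, cols = 4;
  const int64_t k[2] = {-1, 1};  // band from subdiagonal -1 to superdiagonal 1
  const int64_t num_diags = k[1] - k[0] + 1;
  // Longest diagonal in the band; becomes the last output dimension.
  const int64_t max_diag_len =
      std::min(rows + std::min(k[1], int64_t{0}),
               cols + std::min(-k[0], int64_t{0}));
  std::printf("num_diags=%lld max_diag_len=%lld\n",
              (long long)num_diags, (long long)max_diag_len);  // 3 and 3
  // Per-diagonal length, indexed as in the lowering: d = k[1] - m, so m = 0 is
  // the topmost superdiagonal and d decreases from there.
  for (int64_t m = 0; m < num_diags; ++m) {
    const int64_t d = k[1] - m;
    const int64_t diag_len_d = std::min(rows + std::min(d, int64_t{0}),
                                        cols - std::max(d, int64_t{0}));
    std::printf("m=%lld d=%lld diag_len=%lld\n",
                (long long)m, (long long)d, (long long)diag_len_d);
  }
  return 0;  // lengths are 3, 3, 2 for d = 1, 0, -1
}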
+ Value b_zero = BroadcastConstant(loc, indices_shape, 0, 32, rewriter); + Value b_false = BroadcastConstant(loc, indices_shape, 0, 1, rewriter); + Value b_true = BroadcastConstant(loc, indices_shape, 1, 1, rewriter); + Value b_k1 = BroadcastConstant(loc, indices_shape, k[1], 32, rewriter); + Value b_rows = BroadcastConstant(loc, indices_shape, rows, 32, rewriter); + Value b_cols = BroadcastConstant(loc, indices_shape, cols, 32, rewriter); + Value b_max_diag_len = + BroadcastConstant(loc, indices_shape, max_diag_len, 32, rewriter); + + // d = k[1] - m + // (A.k.a. the number of the diagonal, depending on m. Note that we + // subtract m here. This means we start with the superdiagonals and + // move downwards towards the subdiagonals. So the start indices will + // be decreasing.) + Value d = rewriter.create(loc, b_k1, iotaM); + Value neg_d = rewriter.create(loc, d); + + // diag_len_d = min(rows + min(d, 0), cols - max(d, 0)) + // (Length of a diagonal for a given d. Same as max_diag_len for m = 0.) + Value diag_len_d = rewriter.create( + loc, + rewriter.create(loc, b_rows, + rewriter.create(loc, d, b_zero)), + rewriter.create(loc, b_cols, + rewriter.create(loc, d, b_zero))); + + // offset is max_diag_len - diag_len_d if we're padding, 0 otherwise. + Value cmp; + if (subdiagonal_align == kRight && superdiagonal_align == kRight) { + cmp = b_true; + } else if (superdiagonal_align == kRight) { + // offset = d>=0 ? max_diag_len - diag_len_d : 0 + cmp = rewriter.create(loc, d, b_zero); + } else if (subdiagonal_align == kRight) { + // offset = d<=0 ? max_diag_len - diag_len_d : 0 + cmp = rewriter.create(loc, d, b_zero); + } else { + // offset = 0 + cmp = b_false; + } + + // This offset shifts the diagonals to the "left" or "right", depending + // on alignment. + Value offset = rewriter.create( + loc, b_zero.getType(), cmp, + rewriter.create(loc, b_max_diag_len, diag_len_d), b_zero); + + // x = max(d, 0) - offset + // y = max(-d, 0) - offset + Value x = rewriter.create( + loc, rewriter.create(loc, d, b_zero), offset); + Value y = rewriter.create( + loc, rewriter.create(loc, neg_d, b_zero), offset); + + Value n_plus_x = rewriter.create(loc, iotaN, x); + Value n_plus_y = rewriter.create(loc, iotaN, y); + + // GatherOp is happy about letting us index out of bounds values, but those + // values will be undefined. So we mask them later. Set up the boolean + // expression that tells us which entries, in the output shape, are out of + // bounds and thus become the padding_value. + Value x_in_bounds = rewriter.create( + loc, + rewriter.create(loc, b_false.getType(), n_plus_x, + b_zero), + rewriter.create(loc, b_false.getType(), n_plus_x, b_cols)); + Value y_in_bounds = rewriter.create( + loc, + rewriter.create(loc, b_false.getType(), n_plus_y, + b_zero), + rewriter.create(loc, b_false.getType(), n_plus_y, b_rows)); + Value in_bounds = rewriter.create( + loc, + RankedTensorType::get(Shape({num_diags, max_diag_len}), + rewriter.getIntegerType(1)), + rewriter.create(loc, x_in_bounds, y_in_bounds)); + + // Now combine x and y into the index data structure needed for gather. + Shape concat_shape({2, num_diags, max_diag_len}); + Value start_indices = rewriter.create( + loc, RankedTensorType::get(concat_shape, rewriter.getIntegerType(32)), + mlir::ValueRange({n_plus_y, n_plus_x}), + mlir::IntegerAttr::get(rewriter.getIntegerType(64), 0)); + + // Shape of the final output. (Except for dimension folding in the + // single diagonal case.) 
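A scalar sketch of the per-entry index arithmetic built above (d, the alignment offset, x, y, and the in-bounds mask), shown here for the default RIGHT_LEFT alignment. DiagPartIndex is a hypothetical helper; the pattern evaluates the same formulas on broadcast tensors and feeds the (row, col) pairs to mhlo.gather, masking out-of-bounds entries with padding_value.

#include <algorithm>
#include <cstdint>

struct DiagIndex {
  int64_t row, col;  // gather coordinates into the last two input dimensions
  bool in_bounds;    // false => the output entry becomes padding_value
};

// m indexes the diagonal (0 = k[1], the topmost one); n is the position in it.
DiagIndex DiagPartIndex(int64_t rows, int64_t cols, const int64_t k[2],
                        int64_t max_diag_len, int64_t m, int64_t n) {
  const int64_t d = k[1] - m;
  const int64_t diag_len_d = std::min(rows + std::min(d, int64_t{0}),
                                      cols - std::max(d, int64_t{0}));
  // RIGHT_LEFT default: shorter superdiagonals (d >= 0) get a nonzero offset
  // so their entries are shifted within the output row; subdiagonals keep 0.
  const int64_t offset = (d >= 0) ? max_diag_len - diag_len_d : 0;
  const int64_t x = std::max(d, int64_t{0}) - offset;   // column base
  const int64_t y = std::max(-d, int64_t{0}) - offset;  // row base
  const int64_t col = x + n;
  const int64_t row = y + n;
  const bool in_bounds = col >= 0 && col < cols && row >= 0 && row < rows;
  return {row, col, in_bounds};
}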
+ Shape output_shape; + for (int i = 0; i < num_dims - 2; i++) { + output_shape.push_back(input_type.getDimSize(i)); + } + output_shape.push_back(num_diags); + output_shape.push_back(max_diag_len); + auto output_type = RankedTensorType::get(output_shape, element_type); + + // A slice is the shape of what GatherOp copies per lookup. So the last + // two dimensions (M, N in the matrix-diag-part docs) are where we go + // through entry by entry. + ArrayRef input_shape = input_type.getShape(); + Shape slice_sizes(input_shape.begin(), input_shape.end()); + int slice_dimensions = slice_sizes.size(); + slice_sizes[slice_dimensions - 2] = 1; + slice_sizes[slice_dimensions - 1] = 1; + + // Dimensions of the input we won't see in the output (M and N). + SmallVector collapsed_dims( + {slice_dimensions - 2, slice_dimensions - 1}); + + // Which dimensions (in the input) the two offset "columns" map to. + SmallVector start_index_map({num_dims - 2, num_dims - 1}); + + // Gather the diagonal entries. + // TODO(kramm): For a single diagonal, this might be slower than the + // mask + sum approach. Special-case num_diags==1? + auto dims_attr = GatherDimensionNumbers::get( + /*offset_dims=*/GetI64ElementsAttrForSeq(0, num_dims - 2, &rewriter), + /*collapsed_slice_dims=*/GetI64ElementsAttr(collapsed_dims, &rewriter), + /*start_index_map=*/GetI64ElementsAttr(start_index_map, &rewriter), + /*index_vector_dim=*/rewriter.getI64IntegerAttr(0), + rewriter.getContext()); + Value gather = rewriter.create( + loc, output_type, op.input(), start_indices, dims_attr, + GetI64ElementsAttr(slice_sizes, &rewriter)); + + // We now need to broadcast the "in_bounds" boolean expression, as well as + // the padding value, to do the final select. + Shape broadcast_bounds; + for (int i = 0; i < output_shape.size() - 2; i++) { + broadcast_bounds.push_back(output_shape[i]); + } + Value b_in_bounds = rewriter.create( + loc, RankedTensorType::get(output_shape, rewriter.getIntegerType(1)), + in_bounds, GetI64ElementsAttr(broadcast_bounds, &rewriter)); + Value b_padding = rewriter.create( + loc, output_type, op.padding_value(), + GetI64ElementsAttr(output_shape, &rewriter)); + + // Replace all out-of-bounds values in the result with padding_value. + Value result = rewriter.create(loc, output_type, b_in_bounds, + gather, b_padding); + + if (num_diags == 1) { + // matrix_diag_part folds away the 1-sized band dimension if we only + // extract a single diagonal. + result = rewriter.create(loc, op.getType(), result); + } + + rewriter.replaceOp(op, result); + return success(); + } +}; + // Converts TensorFlow EinsumOp to either HLO EinsumOp or UnaryEinsumOp // depending on arity of the op. class ConvertEinsumOp : public OpRewritePattern { @@ -1531,23 +1810,23 @@ using ConvertFusedBatchNormGradV3Op = // Converts TensorFlow FusedBatchNormV3Op to either HLO BatchNormTrainingOp or // HLO BatchNormInferenceOp, depending on the value of the 'is_training' // parameter. 
-class ConvertFusedBatchNormV3Op - : public OpRewritePattern { +template +class ConvertFusedBatchNormBase : public OpRewritePattern { public: - using OpRewritePattern::OpRewritePattern; + using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(TF::FusedBatchNormV3Op op, + LogicalResult matchAndRewrite(FusedBatchNormOpT op, PatternRewriter &rewriter) const override { auto feature_dim = getFeatureDimensionAttr(rewriter, op.data_formatAttr(), op.x()); - auto input_type_tensor = op.x().getType().cast(); + auto input_type_tensor = op.x().getType().template cast(); auto input_element_type = input_type_tensor.getElementType(); - auto scale_type_tensor = op.scale().getType().cast(); + auto scale_type_tensor = op.scale().getType().template cast(); auto scale_element_type = scale_type_tensor.getElementType(); - auto mean_type_tensor = op.mean().getType().cast(); + auto mean_type_tensor = op.mean().getType().template cast(); auto mean_element_type = mean_type_tensor.getElementType(); // In the training case, dimensions of input tensors must be static. if (op.is_training() && (!input_type_tensor.hasStaticShape() || @@ -1561,7 +1840,7 @@ class ConvertFusedBatchNormV3Op Value bn_train_input = rewriter.create(op.getLoc(), op.x(), scale_element_type); TensorType bn_train_input_type_tensor = - bn_train_input.getType().cast(); + bn_train_input.getType().template cast(); if (op.is_training()) { // Training case. @@ -1643,17 +1922,25 @@ class ConvertFusedBatchNormV3Op /*broadcast_dimensions=*/DenseIntElementsAttr()); } - // TF FusedBatchNormV3 op expects 5 outputs. Outputs 3 and 4 are - // currently marked as "reserved spaces 1 and 2". They are used to - // pass the per-batch mean and variance to the gradiant. Here we - // maintain the same behavior by setting them to the mean and variance - // calculated by BatchNormTraining. Output 5 is unused; it doesn't - // matter what we pass there. - rewriter.replaceOp(op, {y_out, /*batch_mean=*/batch_mean, - /*batch_variance=*/corrected_variance, - /*reserve_space_1=*/reserve_space_1, - /*reserve_space_2=*/batch_variance, - /*reserve_space_3=*/op.x()}); + if (std::is_same::value) { + // FusedBatchNormV2 expects 4 outputs. + // Outputs 3 and 4 are currently marked as "reserved spaces 1 and 2". + // They are used to pass the per-batch mean and variance to the + // gradiant. Here we maintain the same behavior by setting them to the + // mean and variance calculated by BatchNormTraining. + rewriter.replaceOp(op, {y_out, /*batch_mean=*/batch_mean, + /*batch_variance=*/corrected_variance, + /*reserve_space_1=*/reserve_space_1, + /*reserve_space_2=*/batch_variance}); + } else { // TF::FusedBatchNormV3Op + // FusedBatchNormV3 expects a 5th output, but the output is unused; it + // doesn't matter what we pass there. + rewriter.replaceOp(op, {y_out, /*batch_mean=*/batch_mean, + /*batch_variance=*/corrected_variance, + /*reserve_space_1=*/reserve_space_1, + /*reserve_space_2=*/batch_variance, + /*reserve_space_3=*/op.x()}); + } } else { // Inference case. auto bn_train_op = rewriter.create( op.getLoc(), @@ -1670,31 +1957,45 @@ class ConvertFusedBatchNormV3Op // not used for inference. It doesn't matter what values we provide for // the last 5 results as long as they are of the same type. Forward // input mean and variance to output mean, variance, reserved_space_1 and - // reserver_space_2. Create a constant tensor to forward to last - // reserve_space_3 output. 
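The V2/V3 split above comes down to a compile-time check of the op type: the replaceOp call for V2 passes five results, while V3 carries one extra reserve_space_3 result that the lowering fills with a dummy value. A minimal sketch of that dispatch shape, with toy tag types standing in for the TF ops:

#include <type_traits>
#include <vector>

struct FusedBatchNormV2Tag {};  // toy stand-ins for TF::FusedBatchNormV2Op
struct FusedBatchNormV3Tag {};  // and TF::FusedBatchNormV3Op

template <typename OpT>
std::vector<const char*> ResultNames() {
  std::vector<const char*> names = {"y", "batch_mean", "batch_variance",
                                    "reserve_space_1", "reserve_space_2"};
  // Only the V3 variant has the extra result.
  if (!std::is_same<OpT, FusedBatchNormV2Tag>::value)
    names.push_back("reserve_space_3");
  return names;
}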
- auto reserve_space_3_type = op.getResult(5).getType().cast(); - int num_elements = reserve_space_3_type.hasStaticShape() - ? reserve_space_3_type.getNumElements() - : 0; - auto const_attr_type = RankedTensorType::get( - {num_elements}, getElementTypeOrSelf(reserve_space_3_type)); - - Value dummy_const = rewriter.create( - op.getLoc(), DenseElementsAttr::get(const_attr_type, 0.0)); - if (const_attr_type != reserve_space_3_type) - dummy_const = rewriter.create( - op.getLoc(), reserve_space_3_type, dummy_const); - rewriter.replaceOp(op, {/*y=*/y_out, - /*batch_mean=*/op.mean(), - /*batch_variance=*/op.variance(), - /*reserve_space_1=*/op.mean(), - /*reserve_space_2=*/op.variance(), - /*reserve_space_3=*/dummy_const}); + // reserved_space_2. + if (std::is_same::value) { + rewriter.replaceOp(op, {/*y=*/y_out, + /*batch_mean=*/op.mean(), + /*batch_variance=*/op.variance(), + /*reserve_space_1=*/op.mean(), + /*reserve_space_2=*/op.variance()}); + } else { + // For FusedBatchNormV3Op, also create a constant tensor to forward to + // last reserve_space_3 output. + auto reserve_space_3_type = + op.getResult(5).getType().template cast(); + int num_elements = reserve_space_3_type.hasStaticShape() + ? reserve_space_3_type.getNumElements() + : 0; + auto const_attr_type = RankedTensorType::get( + {num_elements}, getElementTypeOrSelf(reserve_space_3_type)); + Value dummy_const = rewriter.create( + op.getLoc(), DenseElementsAttr::get(const_attr_type, 0.0)); + if (const_attr_type != reserve_space_3_type) + dummy_const = rewriter.create( + op.getLoc(), reserve_space_3_type, dummy_const); + rewriter.replaceOp(op, {/*y=*/y_out, + /*batch_mean=*/op.mean(), + /*batch_variance=*/op.variance(), + /*reserve_space_1=*/op.mean(), + /*reserve_space_2=*/op.variance(), + /*reserve_space_3=*/dummy_const}); + } } return success(); } }; +using ConvertFusedBatchNormV2Op = + ConvertFusedBatchNormBase; +using ConvertFusedBatchNormV3Op = + ConvertFusedBatchNormBase; + using PaddingArray = std::vector>; @@ -1748,37 +2049,102 @@ static DenseIntElementsAttr GetReduceWindowPaddingAsAttr( flatten_paddings); } +// Helper function for dividing each entry of `pooled` by the count of its +// corresponding window, i.e., the number of non-padding entries of the window +// which an `AvgPool` operation performed on an `input_shape`-tensor would map +// to this entry, depending on `ksize` and `strides`. This function is used for +// `AvgPool` and `AvgPoolGrad` legalizations. +// `zero` is passed as a parameter because it can be reused from caller level. +// `pooled` must have `RankedTensorType`. +template +Operation *AvgPoolDivideByCount( + Value pooled, const SmallVector &input_shape, + const SmallVector &ksize, + const SmallVector &strides, OpTy op, Value zero, + PatternRewriter &rewriter) { + Location loc = op.getLoc(); + RankedTensorType pooled_type = + pooled.getType().template cast(); + Type element_type = pooled_type.getElementType(); + Operation *result = nullptr; + RankedTensorType orig_input_type = + RankedTensorType::get(input_shape, element_type); + + if (op.padding() == "VALID") { + // All window counts are equal here because we don't have padding + // (each entry of `pooled` corresponds to a window that consists of + // original input entries only). + int64_t window_count = std::accumulate(ksize.begin(), ksize.end(), 1, + std::multiplies()); + // Divide `pooled` by window counts. 
+ Value divisor = + GetScalarConstOfType(element_type, loc, window_count, &rewriter); + auto scalar_broadcast_dims = GetI64ElementsAttr({}, &rewriter); + result = rewriter.create( + loc, pooled_type, pooled, divisor, scalar_broadcast_dims); + } else { + assert(op.padding() == "SAME"); + // For SAME padding, only original entries that contributed to a window + // are counted for the average of this window, not padded entries. + + // Build all-ones tensor of same shape as the original input. + ElementsAttr splat = hlo::getSplat(&rewriter, orig_input_type, 1); + auto all_ones_tensor = rewriter.create(loc, splat); + + // Get padding for the input. + DenseIntElementsAttr input_padding_attr = + GetReduceWindowPaddingAsAttr( + input_shape, op.ksize(), op.strides(), op.padding(), &rewriter); + + // Count the 1's in each window, using the same padding as for the input, + // which gives us the window counts by which `pooled` needs to be divided. + auto divisor = rewriter.create( + loc, pooled_type, + /*operand=*/all_ones_tensor, + /*init_value=*/zero, + /*window_dimensions=*/GetI64ElementsAttr(op.ksize()), + /*window_strides=*/GetI64ElementsAttr(op.strides()), + /*base_dilations=*/DenseIntElementsAttr(), + /*window_dilations=*/DenseIntElementsAttr(), + /*padding=*/input_padding_attr); + BuildReduceBody(element_type, &divisor.body(), &rewriter); + + // Divide `pooled` by window counts. + result = rewriter.create(loc, pooled_type, pooled, divisor); + } + return result; +} + +Value GetAvgPoolInput(TF::AvgPoolOp op) { return op.value(); } +Value GetAvgPoolInput(TF::AvgPool3DOp op) { return op.input(); } + // Converts AvgPool op to HLO ReduceWindow op by setting appropriate window // dimensions with add as the reduction function. The reduction result is // then divided by the number of elements in the window. -class ConvertAvgPoolOp : public OpRewritePattern { +template +class ConvertAvgPoolOp : public OpRewritePattern { public: - using OpRewritePattern::OpRewritePattern; + using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(TF::AvgPoolOp op, + LogicalResult matchAndRewrite(OpTy op, PatternRewriter &rewriter) const override { - auto input_type = op.value().getType().dyn_cast(); + Value input_value = GetAvgPoolInput(op); + auto input_type = + input_value.getType().template dyn_cast(); if (!input_type) return failure(); - // TODO(b/147217034): support other data formats. - if (!IsDefaultDataFormat(op.data_format())) return failure(); - // TODO(b/147217034): support "SAME" padding. - if (op.padding() != "VALID") return failure(); - // We will do accumulation first; use a larger bitwidth if suitable. Type input_element_type = input_type.getElementType(); Type sum_element_type = GetSumAccumulationType(input_element_type); Type result_type; // The result type for reduction and division with the proper element type. - if (auto ranked_type = op.getType().dyn_cast()) + if (auto ranked_type = op.getType().template dyn_cast()) result_type = RankedTensorType::get(ranked_type.getShape(), sum_element_type); else result_type = UnrankedTensorType::get(sum_element_type); - Value input_value = op.value(); - // Convert if we need enlarge the element type's bitwidth. if (input_element_type != sum_element_type) input_value = rewriter.create(op.getLoc(), input_value, @@ -1787,9 +2153,9 @@ class ConvertAvgPoolOp : public OpRewritePattern { // Create the tf.ReduceWindow op. 
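A 1-D worked version of what the AvgPoolDivideByCount helper above computes: with VALID padding every window holds exactly ksize real entries, while with SAME padding the edge windows overlap the padding and must be divided by a smaller count, which is what the reduce_window over an all-ones tensor produces. The padding arithmetic below uses the usual TF SAME-padding rule for one dimension.

#include <algorithm>
#include <cstdio>

int main() {
  const int input_len = 5, ksize = 3, stride = 1;
  // SAME padding in 1-D: output length is ceil(input_len / stride).
  const int output_len = (input_len + stride - 1) / stride;
  const int pad_total =
      std::max((output_len - 1) * stride + ksize - input_len, 0);
  const int pad_lo = pad_total / 2;
  for (int o = 0; o < output_len; ++o) {
    const int start = o * stride - pad_lo;
    // Count window positions that fall on real input, not padding.
    int count = 0;
    for (int w = 0; w < ksize; ++w) {
      const int i = start + w;
      if (i >= 0 && i < input_len) ++count;
    }
    std::printf("output %d: divide pooled sum by %d\n", o, count);
  }
  // Prints counts 2, 3, 3, 3, 2. With VALID padding every count is ksize (3).
  return 0;
}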
Value init = GetScalarConstOfType(sum_element_type, op.getLoc(), 0, &rewriter); - DenseIntElementsAttr paddings_attr = - GetReduceWindowPaddingAsAttr<4>(input_type.getShape(), op.ksize(), - op.strides(), op.padding(), &rewriter); + DenseIntElementsAttr paddings_attr = GetReduceWindowPaddingAsAttr( + input_type.getShape(), op.ksize(), op.strides(), op.padding(), + &rewriter); auto reduce = rewriter.create( op.getLoc(), result_type, input_value, init, GetI64ElementsAttr(op.ksize()), GetI64ElementsAttr(op.strides()), @@ -1799,19 +2165,17 @@ class ConvertAvgPoolOp : public OpRewritePattern { // Count the number of elements in the window. The following calculation // is only valid for no paddings. - SmallVector ksize; + SmallVector input_shape( + llvm::to_vector(input_type.getShape())); + SmallVector ksize, strides; GetI64ArrayAttrValues(op.ksize(), &ksize); - int64_t count = std::accumulate(ksize.begin(), ksize.end(), 1, - std::multiplies()); + GetI64ArrayAttrValues(op.strides(), &strides); - // Divide by the number of elements in the window. - Value divisor = - GetScalarConstOfType(sum_element_type, op.getLoc(), count, &rewriter); - auto scalar_broadcast_dims = GetI64ElementsAttr({}, &rewriter); - Value result = rewriter.create( - op.getLoc(), result_type, reduce, divisor, scalar_broadcast_dims); + Operation *result_op = AvgPoolDivideByCount( + reduce.getResult(), input_shape, ksize, strides, op, init, rewriter); // Convert back if we enlarged the element type's bitwidth. + Value result = result_op->getOpResult(0); if (input_element_type != sum_element_type) result = rewriter.create(op.getLoc(), result, input_element_type); @@ -1821,6 +2185,9 @@ class ConvertAvgPoolOp : public OpRewritePattern { } }; +using ConvertAvgPool2DOp = ConvertAvgPoolOp; +using ConvertAvgPool3DOp = ConvertAvgPoolOp; + // `AvgPoolGradOp` is converted to the following operations: // 1. Divide each entry of the output gradient (the gradient for the previous // layer in backpropagation order) by the count of the corresponding window @@ -1894,59 +2261,13 @@ class ConvertAvgPoolGradOp : public OpRewritePattern { auto orig_input_shape_values = orig_input_shape_attr.getValues(); DimVector orig_input_shape(orig_input_shape_values.begin(), orig_input_shape_values.end()); - RankedTensorType orig_input_type = - RankedTensorType::get(orig_input_shape, element_type); DimVector ksize, strides; GetI64ArrayAttrValues(op.ksize(), &ksize); GetI64ArrayAttrValues(op.strides(), &strides); Value zero = GetScalarConstOfType(element_type, loc, 0, &rewriter); - Operation *out_grad_divided = nullptr; - if (op.padding() == "VALID") { - // All window counts are equal here because we don't have padding - // (each entry of `out_grad` corresponds to a window that consists of - // original input entries only). - int64_t window_count = std::accumulate(ksize.begin(), ksize.end(), 1, - std::multiplies()); - // Divide `out_grad` by window counts. - Value divisor = - GetScalarConstOfType(element_type, loc, window_count, &rewriter); - auto scalar_broadcast_dims = GetI64ElementsAttr({}, &rewriter); - out_grad_divided = rewriter.create( - loc, out_grad_type, out_grad, divisor, scalar_broadcast_dims); - } else { - assert(op.padding() == "SAME"); - // For SAME padding, only original entries that contributed to a window - // are counted for the average of this window, not padded entries. - - // Build all-ones tensor of same shape as the original input. 
- ElementsAttr splat = hlo::getSplat(&rewriter, orig_input_type, 1); - auto all_ones_tensor = rewriter.create(loc, splat); - - // Get the same padding as for the original input. - DenseIntElementsAttr orig_padding_attr = - GetReduceWindowPaddingAsAttr(orig_input_shape, op.ksize(), - op.strides(), op.padding(), - &rewriter); - - // Count the 1's in each window, using the same padding as for the - // original input, which gives us the window counts by which `out_grad` - // needs to be divided. - auto window_counts = rewriter.create( - loc, out_grad_type, - /*operand=*/all_ones_tensor, - /*init_value=*/zero, - /*window_dimensions=*/GetI64ElementsAttr(op.ksize()), - /*window_strides=*/GetI64ElementsAttr(op.strides()), - /*base_dilations=*/DenseIntElementsAttr(), - /*window_dilations=*/DenseIntElementsAttr(), - /*padding=*/orig_padding_attr); - BuildReduceBody(element_type, &window_counts.body(), &rewriter); - - // Divide `out_grad` by window counts. - out_grad_divided = rewriter.create(loc, out_grad_type, - out_grad, window_counts); - } + auto out_grad_divided = AvgPoolDivideByCount( + out_grad, orig_input_shape, ksize, strides, op, zero, rewriter); // Get same padding as for original input. PaddingArray orig_padding = GetReduceWindowPaddingAsArray( @@ -2325,19 +2646,21 @@ class ConvertSizeOp : public OpRewritePattern { if (!input_ty) return failure(); const int64_t rank = input_ty.getRank(); - auto result_type = op.getResult().getType(); - Operation *size = - GetScalarConstOfType(result_type.cast().getElementType(), - op.getLoc(), 1, &rewriter); + auto result_ty = op.getResult().getType(); + auto element_ty = result_ty.cast().getElementType(); + Value size = GetScalarConstOfType(element_ty, op.getLoc(), 1, &rewriter); for (int64_t i = 0; i < rank; ++i) { - auto dim = rewriter.create( - op.getLoc(), result_type, input, - rewriter.getIntegerAttr(rewriter.getIntegerType(32), i)); + auto i32_ty = rewriter.getIntegerType(32); + auto size_ty = RankedTensorType::get({}, i32_ty); + auto dim_index = rewriter.getIntegerAttr(i32_ty, i); + Value dim = rewriter.create(op.getLoc(), size_ty, + input, dim_index); + dim = rewriter.create(op.getLoc(), result_ty, dim); size = rewriter.create( - op.getLoc(), size->getResult(0), dim.getResult(), + op.getLoc(), size, dim, /*DenseIntElementsAttr=*/DenseIntElementsAttr()); } - rewriter.replaceOp(op, size->getResult(0)); + rewriter.replaceOp(op, size); return success(); } @@ -2380,7 +2703,8 @@ static void BroadcastBatchMatMulV2Operands(Value lhs, Value rhs, Location loc, rhs_type.getShape().drop_back(2), result_batch_shape_compile_time_extents); auto result_batch_shape = rewriter->create( - loc, lhs_splitted.head(), rhs_splitted.head(), + loc, shape::ShapeType::get(rewriter->getContext()), lhs_splitted.head(), + rhs_splitted.head(), /*error=*/nullptr); // Lambda which handles the broadcasting of one side to the common // leading-batch dimensions. @@ -2640,7 +2964,7 @@ class ConvertSplitVOp : public OpRewritePattern { SmallVector slices; slices.reserve(op.getNumResults()); - for (int i = 0; i < op.getNumResults(); ++i) { + for (int i = 0, end = op.getNumResults(); i < end; ++i) { end_indices[dim_index] = begin_indices[dim_index] + split_sizes[i]; slices.push_back(rewriter.create( op.getLoc(), op.value(), GetI64ElementsAttr(begin_indices, &rewriter), @@ -2815,7 +3139,7 @@ class ConvertStridedSliceOp : public OpRewritePattern { // verifier. 
int64_t slicing_dim_size = op.begin().getType().cast().getShape()[0]; - auto input_rank = input_shape.size(); + const int input_rank = input_shape.size(); for (int d = slicing_dim_size; d < input_rank; ++d) { // We only support slicing major dimensions, so minor dimensions after // slicing dimensions are all sliced with their full sizes. @@ -2856,7 +3180,7 @@ class ConvertStridedSliceOp : public OpRewritePattern { } // For non-slice dims, get the full slice of that dimension. - for (int d = slicing_dim_size; d < input_shape.size(); ++d) { + for (int d = slicing_dim_size, end = input_shape.size(); d < end; ++d) { slice_sizes.push_back(input_shape[d]); slice_begin_indices.push_back(zero); } @@ -3552,7 +3876,8 @@ class ConvertTileOp : public OpRewritePattern { multiples.getType().getRank() != 1) return failure(); - if (multiples.getNumElements() != input_shape.size()) return failure(); + const int64_t input_shape_size = input_shape.size(); + if (multiples.getNumElements() != input_shape_size) return failure(); SmallVector broadcasted_shape; SmallVector broadcast_dimensions; @@ -4339,7 +4664,7 @@ class ConvertUnpackOp : public OpRewritePattern { SmallVector results; results.reserve(op.getNumResults()); - for (int i = 0; i < op.getNumResults(); ++i) { + for (int i = 0, end = op.getNumResults(); i < end; ++i) { begin_indices[axis] = i; end_indices[axis] = i + 1; @@ -4698,7 +5023,12 @@ class ConvertInplaceUpdateOp : public OpRewritePattern { SmallVector unpacked_indices_type( indices_type.getDimSize(0), RankedTensorType::get({}, indices_type.getElementType())); - auto zero_attr = IntegerAttr::get(rewriter.getIntegerType(64), 0); + // Note on zero_attr integer type: DynamicUpdateSlice op start_indices are + // required to have matching types. This rewrite rule creates + // DynamicUpdateSlice ops where the first "start index" is always i32 and + // subsequent ones are constructed based on zero_attr. Thus the type + // for zero_attr needs to be i32 as well. + auto zero_attr = IntegerAttr::get(rewriter.getIntegerType(32), 0); auto unpacked_indices = rewriter.create( op.getLoc(), unpacked_indices_type, indices, zero_attr); @@ -4777,11 +5107,8 @@ class ConvertCumsumOp : public OpRewritePattern { return failure(); } - // TODO(jennik): Add support for the optional 'exclusive' and 'reverse' - // arguments. - if (op.exclusive() || op.reverse()) { - return failure(); - } + ArrayRef input_shape = input_type.getShape(); + int64_t rank = input_shape.size(); // We can only match when the axis is a constant scalar. DenseIntElementsAttr axis_attr; @@ -4789,15 +5116,6 @@ class ConvertCumsumOp : public OpRewritePattern { return failure(); } - // Convert if we need to enlarge the element type's bitwidth to avoid - // precision loss. - Type input_element_type = input_type.getElementType(); - Type sum_element_type = GetSumAccumulationType(input_element_type); - input = rewriter.create(op.getLoc(), input, sum_element_type); - - ArrayRef input_shape = input_type.getShape(); - int64_t rank = input_shape.size(); - // Get the dimension to apply the reduction on, and offset properly if it is // negative. int64_t axis = (*axis_attr.begin()).getSExtValue(); @@ -4805,6 +5123,21 @@ class ConvertCumsumOp : public OpRewritePattern { axis += rank; } + // If we're supposed to sum things up in the reverse direction, we reverse + // the input and then later reverse the output. 
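A scalar sketch of the Cumsum decomposition implemented by the surrounding pattern: reverse the input when reverse=true, run an inclusive windowed sum, apply the pad trick (edge_padding_low = 1, edge_padding_high = -1) for exclusive=true, then reverse the result back. The 4-element input matches the lit tests earlier in this diff; the helper below is a plain reference implementation, not the MLIR code.

#include <algorithm>
#include <cstdio>
#include <vector>

// Reference semantics of tf.Cumsum along one axis, written the way the
// lowering decomposes it (reverse -> inclusive scan -> shift -> reverse back).
std::vector<float> Cumsum(std::vector<float> x, bool exclusive, bool reverse) {
  if (reverse) std::reverse(x.begin(), x.end());
  // Inclusive running sum; the lowering expresses this as a reduce_window
  // whose window covers everything up to and including each position.
  std::vector<float> y(x.size());
  float acc = 0;
  for (size_t i = 0; i < x.size(); ++i) y[i] = acc += x[i];
  if (exclusive) {
    // Equivalent to the mhlo.pad with a leading 0 and the last entry dropped.
    y.insert(y.begin(), 0.0f);
    y.pop_back();
  }
  if (reverse) std::reverse(y.begin(), y.end());
  return y;
}

int main() {
  for (float v : Cumsum({1, 2, 3, 4}, /*exclusive=*/true, /*reverse=*/true))
    std::printf("%g ", v);  // prints: 9 7 4 0
  std::printf("\n");
  return 0;
}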
+ if (op.reverse()) { + llvm::SmallVector dims_to_reverse({axis}); + input = rewriter.create( + op.getLoc(), op.getType(), input, + GetI64ElementsAttr(dims_to_reverse, &rewriter)); + } + + // Convert if we need to enlarge the element type's bitwidth to avoid + // precision loss. + Type input_element_type = input_type.getElementType(); + Type sum_element_type = GetSumAccumulationType(input_element_type); + input = rewriter.create(op.getLoc(), input, sum_element_type); + SmallVector window_dims(rank, 1); SmallVector window_strides(rank, 1); window_dims[axis] = input_shape[axis]; @@ -4827,10 +5160,34 @@ class ConvertCumsumOp : public OpRewritePattern { BuildReduceBody(sum_element_type, &reduce.body(), &rewriter); Value result = reduce.getResult(); + if (op.exclusive()) { + // In "exclusive" operation, the output will start with the "init" (0) + // values. There is no way to express that as a ReduceWindowOp, so run the + // normal operation, and then use a PadOp to add the 0 "column" on the + // left and cut away the last column on the right. + llvm::SmallVector low_padding(rank, 0); + llvm::SmallVector high_padding(rank, 0); + llvm::SmallVector interior_padding(rank, 0); + low_padding[axis] = 1; + high_padding[axis] = -1; + result = rewriter.create( + op.getLoc(), op.getType(), result, init, + GetI64ElementsAttr(low_padding, &rewriter), + GetI64ElementsAttr(high_padding, &rewriter), + GetI64ElementsAttr(interior_padding, &rewriter)); + } + // Convert back if we enlarged the element type's bitwidth. result = rewriter.create(op.getLoc(), result, input_element_type); + if (op.reverse()) { + llvm::SmallVector dims_to_reverse({axis}); + result = rewriter.create( + op.getLoc(), op.getType(), result, + GetI64ElementsAttr(dims_to_reverse, &rewriter)); + } + rewriter.replaceOp(op, result); return success(); } @@ -5358,50 +5715,6 @@ class ConvertQrOp : public OpRewritePattern { } }; -// Converts `TF::SparseMatMulOp` to `TF::MatMulOp`, ignoring the sparseness -// hints, since we currently don't have an implementation that can use this -// information. Adds appropriate casts where necessary to align element types -// of operands and result for `TF::MatMulOp`. -class ConvertSparseMatMulOp : public OpRewritePattern { - public: - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(TF::SparseMatMulOp op, - PatternRewriter &rewriter) const override { - // Result type must be f32 for applying the pattern (currently this is - // required by the op anyway but this might change). - if (!op.product().getType().cast().getElementType().isF32()) { - return failure(); - } - MLIRContext *context = rewriter.getContext(); - llvm::SmallVector operands{op.a(), op.b()}; - for (Value &operand : operands) { - TensorType tensor_type = operand.getType().cast(); - Type element_type = tensor_type.getElementType(); - if (element_type.isF32()) continue; - // Element type can either be f32 or bf16 for `SparseMatMulOp` so it - // must be bf16 here. - assert(element_type.isBF16()); - Type tensor_type_f32; - if (tensor_type.hasRank()) { - tensor_type_f32 = RankedTensorType::get(tensor_type.getShape(), - FloatType::getF32(context)); - } else { - tensor_type_f32 = UnrankedTensorType::get(FloatType::getF32(context)); - } - // Add cast to f32 to conform with element type of result. 
- operand = - rewriter.create(op.getLoc(), tensor_type_f32, operand); - } - Value result = rewriter.create( - op.getLoc(), op.product().getType(), operands[0], operands[1], - op.transpose_a(), op.transpose_b()); - - rewriter.replaceOp(op, {result}); - return success(); - } -}; - // Emits debug information which includes the number of ops of each type which // failed to legalize. void EmitLegalizationErrors(Operation *op, @@ -5449,9 +5762,14 @@ void EmitLegalizationErrors(Operation *op, // Performs the lowering to XLA dialect. void LegalizeTF::runOnFunction() { - if (failed( - legalizeTF(getFunction(), allow_partial_conversion_, legalize_chlo_))) + llvm::Optional tf2xla_fallback_device_type = llvm::None; + if (use_tf2xla_fallback_) { + tf2xla_fallback_device_type = device_type_; + } + if (failed(legalizeTF(getFunction(), allow_partial_conversion_, + legalize_chlo_, tf2xla_fallback_device_type))) { signalPassFailure(); + } } static PassRegistration pass( @@ -5461,53 +5779,48 @@ static PassRegistration pass( #include "tensorflow/compiler/mlir/xla/transforms/generated_legalize_tf.inc" -LogicalResult legalizeTF(Operation *op, bool allow_partial_conversion, - bool legalize_chlo) { +LogicalResult legalizeTF( + Operation *op, bool allow_partial_conversion, bool legalize_chlo, + llvm::Optional tf2xla_fallback_device_type) { MLIRContext *context = op->getContext(); - - // Add lowering patterns to the list. OwningRewritePatternList patterns; - populateWithGenerated(context, &patterns); + // Note that the `OperationConverter` orders patterns lexicographically by: + // 1) Ascending legalization depth (i.e., minimum number of patterns necessary + // to arrive at conversion target). + // 2) Descending pattern benefit. + // 3) Order of patterns in `OwningRewritePatternList`. - // Add patterns that lower some of the high level TensorFlow ops to lower - // level TensorFlow ops. So, we don't have to target all the TensorFlow ops - // here for lowering to HLO. + // Add TF->HLO legalization patterns. + PopulateLegalizeTfPatterns(context, &patterns); + + // Add TF->TF lowering patterns. 
TF::PopulateLoweringTFPatterns(context, &patterns); - patterns.insert< - ConvertAllOp, ConvertAnyOp, ConvertArgMaxOp, ConvertBatchMatMulV2Op, - ConvertBiasAddOp, ConvertBroadcastToOp, ConvertBF16FloorDivOp, - ConvertConv2DOp, ConvertConv3DOp, ConvertDepthConv2DOp, - ConvertConv2DBackpropFilterOp, ConvertConv3DBackpropFilterOp, - ConvertConv2DBackpropInputOp, ConvertConv3DBackpropInputOp, - ConvertCumsumOp, ConvertDiagPartOp, ConvertEinsumOp, - ConvertFusedBatchNormGradOp, ConvertFusedBatchNormGradV2Op, - ConvertFusedBatchNormGradV3Op, ConvertFusedBatchNormV3Op, - ConvertInfeedDequeueTupleOp, ConvertInplaceUpdateOp, ConvertLinSpaceOp, - ConvertMaxOp, ConvertMinOp, ConvertAvgPoolOp, ConvertAvgPool2DGradOp, - ConvertAvgPool3DGradOp, ConvertMaxPool2DOp, ConvertMaxPool3DOp, - ConvertMaxPool2DGradOp, ConvertMaxPool3DGradOp, ConvertMeanOp, - ConvertOneHotOp, ConvertOutfeedEnqueueTupleOp, ConvertProdOp, ConvertQrOp, - ConvertDynamicRangeOp, ConvertRangeOp, ConvertSelectV2Op, - ConvertSigmoidOp, ConvertShapeOp, ConvertSizeOp, - ConvertSoftmaxOp, - ConvertSoftmaxOp, ConvertSparseMatMulOp, - ConvertSplitOp, ConvertSplitVOp, ConvertStridedSliceOp, - ConvertStridedSliceGradOp, ConvertSumOp, ConvertTensorScatterUpdateOp, - ConvertTileOp, ConvertTopKV2Op, ConvertUnpackOp, - ConvertUnsortedSegmentMaxOp, ConvertUnsortedSegmentMinOp, - ConvertUnsortedSegmentProdOp, ConvertUnsortedSegmentSumOp, - ConvertRandomShuffleOp, ConvertXlaShardingOp, - ConvertXlaDynamicUpdateSliceOp>(op->getContext()); + + // Add TF->HLO legalization patterns via TF2XLA fallback. + if (tf2xla_fallback_device_type.hasValue()) { + PopulateLegalizeTfWithTf2XlaPatterns(tf2xla_fallback_device_type.getValue(), + patterns); + } // Populate with CHLO->HLO lowerings to account for TF ops legalized to // CHLO first. if (legalize_chlo) { chlo::PopulateLegalizeChloToHloPatterns(context, &patterns); } + // ConstantLike op is convenient to create splat constants, but is + // canonicalized to plain HLO constant if statically shaped. Add the + // canonicalization pattern to pattern list to enable multi-hop lowering. + chlo::ConstantLikeOp::getCanonicalizationPatterns(patterns, context); ConversionTarget target(*context); if (legalize_chlo) { target.addIllegalDialect(); + + // Mark ConstantLikeOp as dynamically legal only when it doesn't have a + // static result type so that it gets canonicalized to MHLO constant. 
+ target.addDynamicallyLegalOp([](Operation *op) { + return !op->getResultTypes().front().cast().hasStaticShape(); + }); } else { target.addLegalDialect(); } @@ -5535,9 +5848,41 @@ LogicalResult legalizeTF(Operation *op, bool allow_partial_conversion, return applyPartialConversion(op, target, patterns); } +void PopulateLegalizeTfPatterns(MLIRContext *context, + OwningRewritePatternList *patterns) { + populateWithGenerated(context, patterns); + patterns->insert< + ConvertAllOp, ConvertAnyOp, ConvertArgMaxOp, ConvertBatchMatMulV2Op, + ConvertBiasAddOp, ConvertBroadcastToOp, ConvertBF16FloorDivOp, + ConvertConv2DOp, ConvertConv3DOp, ConvertDepthConv2DOp, + ConvertConv2DBackpropFilterOp, ConvertConv3DBackpropFilterOp, + ConvertConv2DBackpropInputOp, ConvertConv3DBackpropInputOp, + ConvertCumsumOp, ConvertDiagPartOp, ConvertEinsumOp, + ConvertFusedBatchNormGradOp, ConvertFusedBatchNormGradV2Op, + ConvertFusedBatchNormGradV3Op, ConvertFusedBatchNormV2Op, + ConvertFusedBatchNormV3Op, ConvertInfeedDequeueTupleOp, + ConvertInplaceUpdateOp, ConvertLinSpaceOp, ConvertMaxOp, ConvertMinOp, + ConvertAvgPool2DOp, ConvertAvgPool3DOp, ConvertAvgPool2DGradOp, + ConvertAvgPool3DGradOp, ConvertMaxPool2DOp, ConvertMaxPool3DOp, + ConvertMaxPool2DGradOp, ConvertMaxPool3DGradOp, ConvertMeanOp, + ConvertOneHotOp, ConvertOutfeedEnqueueTupleOp, ConvertProdOp, ConvertQrOp, + ConvertDynamicRangeOp, ConvertMatrixDiagPartV3Op, ConvertRangeOp, + ConvertSelectV2Op, ConvertSigmoidOp, ConvertShapeOp, ConvertSizeOp, + ConvertSoftmaxOp, + ConvertSoftmaxOp, ConvertSplitOp, ConvertSplitVOp, + ConvertStridedSliceOp, ConvertStridedSliceGradOp, ConvertSumOp, + ConvertTensorScatterUpdateOp, ConvertTileOp, ConvertTopKV2Op, + ConvertUnpackOp, ConvertUnsortedSegmentMaxOp, ConvertUnsortedSegmentMinOp, + ConvertUnsortedSegmentProdOp, ConvertUnsortedSegmentSumOp, + ConvertRandomShuffleOp, ConvertXlaShardingOp, + ConvertXlaDynamicUpdateSliceOp>(context); +} + std::unique_ptr> createLegalizeTFPass( - bool allow_partial_conversion, bool legalize_chlo) { - return std::make_unique(allow_partial_conversion, legalize_chlo); + bool allow_partial_conversion, bool legalize_chlo, + llvm::Optional tf2xla_fallback_device_type) { + return std::make_unique(allow_partial_conversion, legalize_chlo, + tf2xla_fallback_device_type); } } // end namespace mhlo diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_communication.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_communication.cc new file mode 100644 index 00000000000..1d6ce36300f --- /dev/null +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_communication.cc @@ -0,0 +1,907 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file implements logic for lowering TensorFlow dialect's communication +// ops (TF/XLA) to the HLO dialect. 
+ +#include +#include + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Sequence.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormatVariadic.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/xla/type_to_shape.h" +#include "tensorflow/compiler/xla/client/sharding_builder.h" +#include "tensorflow/compiler/xla/primitive_util.h" + +namespace mlir { +namespace mhlo { + +namespace { +constexpr char kShardingAttr[] = "mhlo.sharding"; +constexpr char kFrontendAttributesAttr[] = "mhlo.frontend_attributes"; +const char kXlaHostTransferRendezvousNameAttr[] = + "_xla_host_transfer_rendezvous"; +const char kXlaHostTransferOriginalTypeAttr[] = + "_xla_host_transfer_original_type"; + +// A pass that legalizes TF/XLA communication ops, propagate their respective +// tokens (for ordering), and rewrite their respective functions and control +// flow ops when necessary. +// Note, this currently does not handle nested modules/functions or region based +// ops other than certain control flow ops (`mhlo.if`, `mhlo.while`). +class LegalizeTFCommunication + : public PassWrapper> { + public: + void runOnOperation() override; +}; + +// Checks if an op is a TF/XLA communication op. +bool IsCommunicationOp(Operation* op) { + return isa(op); +} + +// Checks if an op is a supported HLO control flow op. +bool IsControlFlowOp(Operation* op) { return isa(op); } + +// Collects control flow op ancestors of a given op, up until FuncOp. If any +// ancestor is not a control flow op or a FuncOp, or of a single block region, +// an error will be returned. +LogicalResult GetControlFlowAncestors( + Operation* op, llvm::SmallPtrSetImpl& control_flow_ops, + llvm::SmallPtrSetImpl& control_flow_blocks) { + Block* block = op->getBlock(); + Operation* parent = block->getParentOp(); + while (block && parent && !isa(parent)) { + if (!IsControlFlowOp(parent)) + return op->emitOpError() + << "expects ancestor(s) to be of ['" << IfOp::getOperationName() + << "', '" << FuncOp::getOperationName() << "']"; + + if (!llvm::hasSingleElement(block->getParent()->getBlocks())) + return op->emitOpError() << "expects single block region ancestor(s)"; + + control_flow_ops.insert(parent); + control_flow_blocks.insert(block); + + parent = block->getParentOp(); + block = parent->getBlock(); + } + return success(); +} + +// Finds communication ops in a function. `control_flow_ops` and +// `control_flow_blocks` will be populated with control flow op ancestors for +// every communication op. 
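A toy sketch of the ancestor walk in GetControlFlowAncestors above, using a hypothetical node type with parent pointers instead of MLIR's Operation/Block; the real code additionally rejects multi-block regions and reports errors through op->emitOpError rather than exceptions.

#include <set>
#include <stdexcept>
#include <string>

// Hypothetical IR node: kind is "func", "if", "while", or something else.
struct Node {
  std::string kind;
  Node* parent = nullptr;
};

// Collect control-flow ancestors of `op` up to (not including) the enclosing
// function; anything else on the path is an error, mirroring the pass.
std::set<Node*> ControlFlowAncestors(Node* op) {
  std::set<Node*> ancestors;
  for (Node* p = op->parent; p && p->kind != "func"; p = p->parent) {
    if (p->kind != "if" && p->kind != "while")
      throw std::runtime_error("expects ancestors to be control flow or func");
    ancestors.insert(p);
  }
  return ancestors;
}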
+LogicalResult FindCommunicationOps( + FuncOp func, llvm::SmallPtrSetImpl& control_flow_ops, + llvm::SmallPtrSetImpl& control_flow_blocks, + bool& has_communication_ops) { + auto result = func.walk([&](Operation* op) { + if (!IsCommunicationOp(op)) return WalkResult::advance(); + has_communication_ops = true; + if (failed( + GetControlFlowAncestors(op, control_flow_ops, control_flow_blocks))) + return WalkResult::interrupt(); + return WalkResult::advance(); + }); + return failure(result.wasInterrupted()); +} + +// Helper struct holding a function to be rewritten, it's control flow ops that +// lead to a communication op or function call with a communication op +// (transitively), and an optional clone of itself. If `clone` is set, function +// calls to `original` will be replaced with `clone`. +struct FuncToRewrite { + FuncOp original; + llvm::SmallPtrSet control_flow_ops; + llvm::SmallPtrSet control_flow_blocks; + FuncOp clone; +}; + +// Finds all functions that need to be rewritten with communication ops and +// and associated tokens. +LogicalResult GetFunctionsToRewrite( + ModuleOp module, + llvm::SmallDenseMap& funcs_to_rewrite) { + // Find functions containing communication ops. + SmallVector funcs_to_visit; + for (FuncOp func : module.getOps()) { + FuncToRewrite func_to_rewrite{/*original=*/func, /*control_flow_ops=*/{}, + /*control_flow_blocks=*/{}, + /*clone=*/nullptr}; + bool has_communication_ops = false; + if (failed(FindCommunicationOps(func, func_to_rewrite.control_flow_ops, + func_to_rewrite.control_flow_blocks, + has_communication_ops))) + return failure(); + + if (!has_communication_ops) continue; + funcs_to_rewrite.insert({func.getName(), func_to_rewrite}); + funcs_to_visit.push_back(func); + } + + // Find functions that call functions with communication ops, transitively. + while (!funcs_to_visit.empty()) { + SmallVector new_funcs_to_visit; + for (FuncOp& func : funcs_to_visit) { + auto uses = func.getSymbolUses(module); + if (!uses) continue; + for (auto& use : *uses) { + // Only `mlir::CallOp` is supported as this requires knowing how to + // rewrite arguments and results to a function. + if (!isa(use.getUser())) continue; + auto caller_parent_func = use.getUser()->getParentOfType(); + if (!caller_parent_func) continue; + + FuncToRewrite func_to_rewrite{/*original=*/caller_parent_func, + /*control_flow_ops=*/{}, + /*control_flow_blocks=*/{}, + /*clone=*/nullptr}; + if (failed(GetControlFlowAncestors( + use.getUser(), func_to_rewrite.control_flow_ops, + func_to_rewrite.control_flow_blocks))) + return failure(); + + auto it = funcs_to_rewrite.insert( + {caller_parent_func.getName(), func_to_rewrite}); + if (it.second) { + new_funcs_to_visit.push_back(caller_parent_func); + } else { + it.first->getSecond().control_flow_ops.insert( + func_to_rewrite.control_flow_ops.begin(), + func_to_rewrite.control_flow_ops.end()); + it.first->getSecond().control_flow_blocks.insert( + func_to_rewrite.control_flow_blocks.begin(), + func_to_rewrite.control_flow_blocks.end()); + } + } + } + + funcs_to_visit.swap(new_funcs_to_visit); + } + + // Clone public functions that need to be rewritten. Function calls to this + // function will be replaced with the cloned function. 
+ SymbolTable symbol_table(module); + for (auto& func : funcs_to_rewrite) { + if (func.getSecond().original.isPublic() && + !func.getSecond().original.symbolKnownUseEmpty(module)) { + auto clone = func.getSecond().original.clone(); + clone.setVisibility(SymbolTable::Visibility::Private); + symbol_table.insert(clone); + func.getSecond().clone = clone; + } + } + + return success(); +} + +// Assigns op sharding to an op for a given device core. +void SetOpSharding(Operation* op, int64_t tpu_core) { + std::string sharding_serialized = + ::xla::sharding_builder::AssignDevice(tpu_core).SerializeAsString(); + op->setAttr(kShardingAttr, + StringAttr::get(sharding_serialized, op->getContext())); +} + +// Assigns frontend attributes holding information about data type and +// TensorFlow rendezvous channel name. +void SetFrontendAttributes(Operation* op, StringRef key, Type type) { + MLIRContext* context = op->getContext(); + + auto rendezvous_name = StringAttr::get(key, context); + auto rendezvous_name_attr = NamedAttribute( + Identifier::get(kXlaHostTransferRendezvousNameAttr, context), + rendezvous_name); + + auto element_type = getElementTypeOrSelf(type); + auto xla_element_type = ::xla::TypeToPrimitiveType(element_type); + const std::string& xla_element_type_str = + ::xla::primitive_util::LowercasePrimitiveTypeName(xla_element_type); + auto original_type = StringAttr::get(xla_element_type_str, context); + auto original_type_attr = + NamedAttribute(Identifier::get(kXlaHostTransferOriginalTypeAttr, context), + original_type); + + auto frontend_attributes = DictionaryAttr::get( + ArrayRef{rendezvous_name_attr, original_type_attr}, + context); + op->setAttr(kFrontendAttributesAttr, frontend_attributes); +} + +// Assigns frontend attributes holding information about data type and +// TensorFlow rendezvous channel name specific to `tf._XlaHostComputeMlir`. +// TensorFlow rendezvous channel name is handled differently as individual names +// are used per data send and receive. +void SetFrontendAttributes(Operation* op, int32_t index, StringRef key, + Type type, bool device_to_host) { + std::string formatted_key = + device_to_host ? llvm::formatv("{0}_dtoh_{1}", key, index).str() + : llvm::formatv("{0}_htod_{1}", key, index).str(); + + return SetFrontendAttributes(op, formatted_key, type); +} + +// Creates a `mhlo.send` op for sending value `operand`. If `index` is set, +// `key` will be rewritten with a suffix and index. If `tpu_core` is set, op +// sharding for the respective device will be set. +Value CreateSendOp(OpBuilder& builder, int64_t& channel_id, Location loc, + Value operand, StringRef key, const Optional& index, + const Optional& tpu_core, Value token) { + // type 2 == DEVICE_TO_HOST + auto channel_handle = ChannelHandle::get( + /*handle=*/builder.getI64IntegerAttr(channel_id++), + /*type=*/builder.getI64IntegerAttr(2), builder.getContext()); + auto send = builder.create( + loc, token.getType(), operand, token, channel_handle, + /*is_host_transfer=*/builder.getBoolAttr(true)); + + if (index) { + SetFrontendAttributes(send, *index, key, operand.getType(), + /*device_to_host=*/true); + } else { + SetFrontendAttributes(send, key, operand.getType()); + } + + if (tpu_core) SetOpSharding(send, *tpu_core); + + return send.getResult(); +} + +// Creates a `mhlo.recv` op for receiving a value. If `index` is set, `key` will +// be rewritten with a suffix and index. If `tpu_core` is set, op sharding for +// the respective device will be set. 
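The SetFrontendAttributes overload above derives a distinct rendezvous key per transfer by appending a direction suffix and the operand/result index to the `tf._XlaHostComputeMlir` key. A standalone sketch of the resulting key strings (the key value is a made-up example):

#include <iostream>
#include <string>
#include "llvm/Support/FormatVariadic.h"

int main() {
  std::string key = "host_compute_channel_0";  // hypothetical send/recv key
  // Device-to-host transfer of operand #1 and host-to-device transfer of
  // result #0, matching the "{0}_dtoh_{1}" / "{0}_htod_{1}" format strings.
  std::cout << llvm::formatv("{0}_dtoh_{1}", key, 1).str() << "\n";
  std::cout << llvm::formatv("{0}_htod_{1}", key, 0).str() << "\n";
  return 0;
}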
+Value CreateRecvOp(OpBuilder& builder, int64_t& channel_id, Location loc, + Value result, StringRef key, const Optional& index, + const Optional& tpu_core, Value token) { + // type 3 == HOST_TO_DEVICE + auto channel_handle = ChannelHandle::get( + /*handle=*/builder.getI64IntegerAttr(channel_id++), + /*type=*/builder.getI64IntegerAttr(3), builder.getContext()); + auto result_type = result.getType(); + auto recv_result_type = + TupleType::get({result_type, token.getType()}, builder.getContext()); + auto recv = + builder.create(loc, recv_result_type, token, channel_handle, + /*is_host_transfer=*/builder.getBoolAttr(true)); + if (index) { + SetFrontendAttributes(recv, *index, key, result_type, + /*device_to_host=*/false); + } else { + SetFrontendAttributes(recv, key, result.getType()); + } + if (tpu_core) SetOpSharding(recv, *tpu_core); + + auto get_tuple_element = + builder.create(loc, recv.getResult(), /*index=*/0); + if (tpu_core) SetOpSharding(get_tuple_element, *tpu_core); + + result.replaceAllUsesWith(get_tuple_element); + + auto new_token = builder.create(loc, recv.getResult(), + /*index=*/1); + if (tpu_core) SetOpSharding(new_token, *tpu_core); + + return new_token.getResult(); +} + +// Creates a new token if necessary, acting as a sink to previous tokens. If +// there is only one token in `tokens`, the only token is returned. If `tokens` +// is empty, `original_token` is returned instead. +Value CreateSinkToken(OpBuilder& builder, Location loc, ArrayRef tokens, + Value original_token) { + if (tokens.empty()) { + return original_token; + } else if (llvm::hasSingleElement(tokens)) { + return tokens[0]; + } else { + return builder.create(loc, original_token.getType(), tokens) + .getResult(); + } +} + +// Replaces `tf._XlaHostComputeMlir` with individual `mhlo.send` and `mhlo.recv` +// ops per operand and result. Unique Channel Id's are assigned per transfer. +// Sink tokens are created across all `mhlo.send` ops first and then by +// all `mhlo.recv` ops. +Value RewriteHostComputeOp(OpBuilder& builder, int64_t& channel_id, + TF::_XlaHostComputeMlirOp host_compute, + Value token) { + builder.setInsertionPoint(host_compute); + Location loc = host_compute.getLoc(); + int64_t tpu_core = host_compute.tpu_coreAttr().getInt(); + + SmallVector send_tokens; + for (auto operand : llvm::enumerate(host_compute.inputs())) { + auto send_token = + CreateSendOp(builder, channel_id, loc, operand.value(), + host_compute.send_key(), operand.index(), tpu_core, token); + send_tokens.push_back(send_token); + } + token = CreateSinkToken(builder, loc, send_tokens, token); + + SmallVector recv_tokens; + for (auto result : llvm::enumerate(host_compute.outputs())) { + auto recv_token = + CreateRecvOp(builder, channel_id, loc, result.value(), + host_compute.recv_key(), result.index(), tpu_core, token); + recv_tokens.push_back(recv_token); + } + token = CreateSinkToken(builder, loc, recv_tokens, token); + + host_compute.erase(); + return token; +} + +// Replaces `tf.XlaSendToHost` with a `mhlo.send`. +Value RewriteSendToHostOp(OpBuilder& builder, int64_t& channel_id, + TF::XlaSendToHostOp send_to_host, Value token) { + builder.setInsertionPoint(send_to_host); + token = CreateSendOp(builder, channel_id, send_to_host.getLoc(), + send_to_host.input(), send_to_host.key(), + /*index=*/llvm::None, /*tpu_core=*/llvm::None, token); + + send_to_host.erase(); + return token; +} + +// Replaces `tf.XlaRecvFromHost` with a `mhlo.recv`. 
+Value RewriteRecvFromHostOp(OpBuilder& builder, int64_t& channel_id, + TF::XlaRecvFromHostOp recv_from_host, Value token) { + builder.setInsertionPoint(recv_from_host); + token = CreateRecvOp(builder, channel_id, recv_from_host.getLoc(), + recv_from_host.output(), recv_from_host.key(), + /*index=*/llvm::None, /*tpu_core=*/llvm::None, token); + + recv_from_host.erase(); + return token; +} + +// Replaces a `mlir::CallOp` with one that has an extra `!mhlo.token` operand +// and `!mhlo.token` result. If `new_symbol` is set, the new call will be +// updated to call the `new_symbol` instead. +Value RewriteCallOp(OpBuilder& builder, CallOp call, + const Optional& new_symbol, Value token) { + builder.setInsertionPoint(call); + auto new_operands = llvm::to_vector<4>(call.getArgOperands()); + new_operands.push_back(token); + auto new_result_types = llvm::to_vector<4>(call.getResultTypes()); + new_result_types.push_back(token.getType()); + auto new_call = builder.create( + call.getLoc(), new_result_types, new_symbol ? *new_symbol : call.callee(), + new_operands); + + for (auto results : llvm::zip(call.getResults(), new_call.getResults())) + std::get<0>(results).replaceAllUsesWith(std::get<1>(results)); + call.erase(); + return new_call.getResults().back(); +} + +// Helper struct holding state of which op to visit to next. If `op` is in a +// control flow op region, `region_idx` will be set with the respective region +// index. `token` will be current token from the last communication op/control +// flow op transitive communication ops. +struct OpVisitorState { + Optional region_idx; + Value token; + Operation* op; +}; + +// Creates a tuple from a sequence of values. +Value CreateTuple(OpBuilder& builder, Location loc, ArrayRef operands) { + return builder.create(loc, operands).getResult(); +} + +// Replaces a value `value` with a new value but the token attached. If `value` +// is not a tuple, a new tuple is formed with `token`. If `value` is a tuple, +// `value` is extended instead. New tuple values created are cached. +Value GetValueWithToken(OpBuilder& builder, Value value, Value token, + llvm::SmallDenseMap& rewritten_values) { + // If value with token already exists, reuse it. + auto it = rewritten_values.find(value); + if (it != rewritten_values.end()) return it->getSecond(); + + auto create_tuple = [&](ArrayRef operands) { + auto new_result = CreateTuple(builder, value.getLoc(), operands); + rewritten_values.insert({value, new_result}); + return new_result; + }; + + auto tuple_type = value.getType().dyn_cast(); + // `value` is not a tuple, create a new tuple. + if (!tuple_type) return create_tuple({value, token}); + + // Extend tuple if `value` is a tuple. + // If `value` is an op result and the owner is a `mhlo.tuple`, simply unpack + // the tuple. + if (auto tuple_op = value.getDefiningOp()) { + auto tuple_operands = llvm::to_vector<4>(tuple_op.getOperands()); + tuple_operands.push_back(token); + return create_tuple(tuple_operands); + } + + // `value` is not created via a `mhlo.tuple` directly, unpack individual + // elements directly with `mhlo.get_tuple_element`. + SmallVector tuple_operands; + for (auto idx : llvm::seq(0, tuple_type.getTypes().size())) + tuple_operands.push_back( + builder.create(value.getLoc(), value, idx) + .getResult()); + + tuple_operands.push_back(token); + return create_tuple(tuple_operands); +} + +// Extends a type to include a `mhlo.token` type. If `type` is not a tuple type, +// a new tuple type with `type` and `mhlo.token` type is created instead. 
+TupleType GetTypeWithToken(OpBuilder& builder, Type type) { + auto token_type = TokenType::get(builder.getContext()); + if (auto tuple_type = type.dyn_cast()) { + auto result_types = llvm::to_vector<4>(tuple_type.getTypes()); + result_types.push_back(token_type); + return builder.getTupleType(result_types); + } + + return builder.getTupleType({type, token_type}); +} + +// Creates a slice of a tuple `value` with `mhlo.get_tuple_element` from index 0 +// to `end`, exclusive. +Value CreateSubTuple(OpBuilder& builder, Value value, size_t end) { + SmallVector tuple_operands; + for (auto idx : llvm::seq(0, end)) + tuple_operands.push_back( + builder.create(value.getLoc(), value, idx) + .getResult()); + + return CreateTuple(builder, value.getLoc(), tuple_operands); +} + +// Replaces uses of `value` with `replacement`. If `value` is not a tuple type, +// an explicit `mhlo.get_tuple_element` is created to unpack the tuple and +// return the first element. Otherwise, `mhlo.get_tuple_element` users are +// simply updated with `replacement`, and all other users are updated with a +// slice of `replacement`. +void ReplaceWithTupleResult(OpBuilder& builder, Value value, + Value replacement) { + auto tuple_type = value.getType().dyn_cast(); + if (!tuple_type) { + if (!value.use_empty()) { + auto new_element = builder.create(replacement.getLoc(), + replacement, 0); + value.replaceAllUsesWith(new_element.getResult()); + } + return; + } + + Value sub_tuple; + for (auto& use : llvm::make_early_inc_range(value.getUses())) { + if (isa(use.getOwner())) { + use.set(replacement); + continue; + } + + if (!sub_tuple) + sub_tuple = CreateSubTuple(builder, replacement, tuple_type.size()); + + use.set(sub_tuple); + } +} + +// Replaces control flow op block single block argument with new block argument +// of type `new_type` (tuple type). The last element of the new block argument +// (token) is returned. +Value UpdateControlFlowBlockArgWithToken(OpBuilder& builder, Block& block, + Type token_type) { + assert(block.getNumArguments() == 1); + builder.setInsertionPointToStart(&block); + auto new_arg = block.addArgument(token_type); + ReplaceWithTupleResult(builder, block.getArgument(0), new_arg); + block.eraseArgument(0); + return builder + .create(new_arg.getLoc(), new_arg, + token_type.cast().size() - 1) + .getResult(); +} + +// Updates control flow op terminator with an extra element `token`. If the +// original return value is not a tuple, a new tuple is formed. Otherwise the +// tuple is extended. +void RewriteControlFlowTerminator(OpBuilder& builder, Operation* terminator, + Value token) { + assert(terminator->getNumOperands() == 1); + assert(terminator->getBlock()->getNumArguments() == 1); + // `mhlo.while` cond terminator does not need to be rewritten as it always + // returns a tensor predicate value. + if (auto while_parent = dyn_cast_or_null(terminator->getParentOp())) + if (terminator->getParentRegion() == &while_parent.cond()) return; + + builder.setInsertionPoint(terminator); + llvm::SmallDenseMap rewritten_operands; + Value new_result = GetValueWithToken(builder, terminator->getOperand(0), + token, rewritten_operands); + terminator->setOperand(0, new_result); +} + +// Rewrites a `mhlo.if` op to receive and forward a `mhlo.token`. Operands to +// the op for all of its regions are extended to have an extra operand `token`. 
+void RewriteRegionIfOp(OpBuilder& builder, IfOp region_if, + SmallVectorImpl& ops_to_visit, + Value token) { + llvm::SmallDenseMap rewritten_operands; + + // Rewrite all region operands to have an extra operand `token`. + Value new_true_operand = GetValueWithToken(builder, region_if.true_arg(), + token, rewritten_operands); + Value new_false_operand = GetValueWithToken(builder, region_if.false_arg(), + token, rewritten_operands); + + auto new_result_type = GetTypeWithToken(builder, region_if.getType()); + + // Create new `mhlo.if` op with extra token operands and result. + auto new_if = builder.create(region_if.getLoc(), new_result_type, + region_if.pred(), new_true_operand, + new_false_operand); + + // Move all regions from the old `mhlo.if` op to its replacement. + new_if.true_branch().takeBody(region_if.true_branch()); + new_if.false_branch().takeBody(region_if.false_branch()); + + // Forward result from old `mhlo.if` with replacement, and unpack result when + // necessary. + ReplaceWithTupleResult(builder, region_if.getResult(), new_if.getResult()); + + auto new_token = builder.create( + new_if.getLoc(), new_if.getResult(), + new_if.getResult().getType().cast().size() - 1); + + region_if.erase(); + + // Remove leftover operands to old `mhlo.if` if they have no uses. + for (auto& rewritten_operand : rewritten_operands) + if (auto tuple_op = rewritten_operand.getFirst().getDefiningOp()) + if (tuple_op.use_empty()) tuple_op.erase(); + + // Next op to visit. The replacement is visited but at its first region. The + // token result of the new region if is propagated. + ops_to_visit.push_back({/*region_idx=*/0, new_token, new_if}); +} + +// Rewrites a `mhlo.if`/`mhlo.while` region to receive and forward a +// `mhlo.token`. The block argument is updated to have an extra `mhlo.token` +// element. If the region block is to be rewritten, the next op to visit is set +// to the first op in the block. Otherwise the terminator is updated to forward +// `token`. +void RewriteControlFlowOpRegion( + OpBuilder& builder, Operation* region_op, unsigned region_idx, + Type block_arg_type, SmallVectorImpl& ops_to_visit, + const llvm::SmallPtrSetImpl& control_flow_blocks, Value token) { + ops_to_visit.push_back({region_idx + 1, token, region_op}); + + Region& region = region_op->getRegion(region_idx); + assert(llvm::hasSingleElement(region)); + + auto block_token = UpdateControlFlowBlockArgWithToken(builder, region.front(), + block_arg_type); + + if (control_flow_blocks.contains(®ion.front())) { + ops_to_visit.push_back({/*region_idx=*/llvm::None, block_token, + block_token.getDefiningOp()->getNextNode()}); + return; + } + + RewriteControlFlowTerminator(builder, region.front().getTerminator(), + block_token); +} + +// Rewrites an `mhlo.if` op or its region. If `region_idx` is not set, the op +// operands and results are rewritten. If `region_idx` is set, region +// `region_idx` is rewritten to take in and return an additional token. Returns +// true if the op or its region was rewritten. 
+bool ProcessRegionIfOp(OpBuilder& builder, IfOp region_if, + Optional region_idx, + SmallVectorImpl& ops_to_visit, + const llvm::SmallPtrSetImpl& control_flow_blocks, + Value token) { + builder.setInsertionPoint(region_if); + + if (!region_idx) { + RewriteRegionIfOp(builder, region_if, ops_to_visit, token); + return true; + } + + if (*region_idx < region_if.getNumRegions()) { + RewriteControlFlowOpRegion(builder, region_if, *region_idx, + region_if.getOperand(*region_idx + 1).getType(), + ops_to_visit, control_flow_blocks, token); + return true; + } + + return false; +} + +// Rewrites a `mhlo.while` op to receive and forward a `mhlo.token`. Operands to +// the op for all of its regions are extended to have an extra operand `token`. +void RewriteRegionWhileOp(OpBuilder& builder, WhileOp region_while, + SmallVectorImpl& ops_to_visit, + Value token) { + llvm::SmallDenseMap rewritten_operands; + + // Rewrite region operand to have an extra operand `token`. + Value new_val_operand = + GetValueWithToken(builder, region_while.val(), token, rewritten_operands); + + auto new_result_type = GetTypeWithToken(builder, region_while.getType()); + + // Create new `mhlo.while` op with extra token operand and result. + auto new_while = builder.create(region_while.getLoc(), + new_result_type, new_val_operand); + + // Move all regions from the old `mhlo.while` op to its replacement. + new_while.cond().takeBody(region_while.cond()); + new_while.body().takeBody(region_while.body()); + + // Forward result from old `mhlo.while` with replacement, and unpack result + // when necessary. + ReplaceWithTupleResult(builder, region_while.getResult(), + new_while.getResult()); + + auto new_token = builder.create( + new_while.getLoc(), new_while.getResult(), + new_while.getResult().getType().cast().size() - 1); + + region_while.erase(); + + // Remove leftover operands to old `mhlo.while` if they have no uses. + for (auto& rewritten_operand : rewritten_operands) + if (auto tuple_op = rewritten_operand.getFirst().getDefiningOp()) + if (tuple_op.use_empty()) tuple_op.erase(); + + // Next op to visit. The replacement is visited but at its first region. The + // token result of the new region if is propagated. + ops_to_visit.push_back({/*region_idx=*/0, new_token, new_while}); +} + +// Rewrites an `mhlo.while` op or its region. If `region_idx` is not set, the op +// operands and results are rewritten. If `region_idx` is set, region +// `region_idx` is rewritten to take in and return an additional token. Returns +// true if the op or its region was rewritten. +bool ProcessRegionWhileOp( + OpBuilder& builder, WhileOp region_while, Optional region_idx, + SmallVectorImpl& ops_to_visit, + const llvm::SmallPtrSetImpl& control_flow_blocks, Value token) { + builder.setInsertionPoint(region_while); + + if (!region_idx) { + RewriteRegionWhileOp(builder, region_while, ops_to_visit, token); + return true; + } + + if (*region_idx < region_while.getNumRegions()) { + RewriteControlFlowOpRegion(builder, region_while, *region_idx, + region_while.val().getType(), ops_to_visit, + control_flow_blocks, token); + return true; + } + + return false; +} + +// Updates function type based on current function body block arguments and +// terminator operand types. 
+void UpdateFunctionType(OpBuilder& builder, FuncOp func, Block& func_body) { + auto new_argument_types = llvm::to_vector<4>(func_body.getArgumentTypes()); + auto new_result_types = + llvm::to_vector<4>(func_body.getTerminator()->getOperandTypes()); + func.setType(FunctionType::get(new_argument_types, new_result_types, + builder.getContext())); +} + +// Replaces a function terminator `return` with another `return` that has an +// extra `mhlo.token` operand. +void RewriteFunctionTerminator(OpBuilder& builder, mlir::ReturnOp terminator, + Value token) { + auto new_results = llvm::to_vector<4>(terminator.getOperands()); + new_results.push_back(token); + builder.setInsertionPoint(terminator); + builder.create(terminator.getLoc(), new_results); + terminator.erase(); +} + +// Rewrites a function body and communication ops inside. Region control flow +// are updated when necessary, to propagate tokens. The function may either be +// rewritten to create a token or take in and return a token, depending on its +// visibility and if there are any callers. +LogicalResult RewriteFunction( + OpBuilder& builder, int64_t& channel_id, ModuleOp module, FuncOp func, + const llvm::SmallDenseMap& funcs, + const llvm::SmallPtrSetImpl& control_flow_ops, + const llvm::SmallPtrSetImpl& control_flow_blocks, bool is_clone) { + MLIRContext* context = module.getContext(); + if (!llvm::hasSingleElement(func.getBody())) + return func.emitError() + << "'" << FuncOp::getOperationName() + << "' ops with more than one block are not supported"; + + bool rewrite_block = + is_clone || (!func.isPublic() && !func.symbolKnownUseEmpty(module)); + Block& func_body = func.front(); + + builder.setInsertionPointToStart(&func_body); + auto token_type = TokenType::get(context); + // If a function is public, it's signature should not be modified, and instead + // a token will be created. Otherwise a token block argument is inserted. + Value init_token = + rewrite_block ? func_body.addArgument(token_type) + : builder.create(func.getLoc(), token_type) + .getResult(); + + // Stack to keep track of region based control flow op nesting and current + // op to visit. + SmallVector ops_to_visit{ + {/*region_idx=*/llvm::None, init_token, &func_body.front()}}; + + while (!ops_to_visit.empty()) { + OpVisitorState op_to_visit = ops_to_visit.pop_back_val(); + Operation* curr_op = op_to_visit.op; + + Value token = op_to_visit.token; + // Ops may be removed, so the next op is kept track of beforehand. + Operation* next_op = curr_op->getNextNode(); + + if (auto host_compute = dyn_cast(curr_op)) { + token = RewriteHostComputeOp(builder, channel_id, host_compute, token); + } else if (auto send_to_host = dyn_cast(curr_op)) { + token = RewriteSendToHostOp(builder, channel_id, send_to_host, token); + } else if (auto recv_from_host = dyn_cast(curr_op)) { + token = RewriteRecvFromHostOp(builder, channel_id, recv_from_host, token); + } else if (auto call = dyn_cast(curr_op)) { + // Only `mlir::CallOp` is supported as this requires knowing how to + // rewrite arguments and results to a function. + auto it = funcs.find(call.getCallee()); + if (it != funcs.end()) { + FuncOp clone = it->getSecond().clone; + Optional symbol_name = + clone ? Optional(clone.getName()) : llvm::None; + // If the function being called is to be cloned, update the call to also + // point to the cloned function. 
+ token = RewriteCallOp(builder, call, symbol_name, token); + } + } else if (auto region_if = dyn_cast(curr_op)) { + if (op_to_visit.region_idx || control_flow_ops.contains(region_if)) + if (ProcessRegionIfOp(builder, region_if, op_to_visit.region_idx, + ops_to_visit, control_flow_blocks, token)) + continue; + } else if (auto region_while = dyn_cast(curr_op)) { + if (op_to_visit.region_idx || control_flow_ops.contains(region_while)) + if (ProcessRegionWhileOp(builder, region_while, op_to_visit.region_idx, + ops_to_visit, control_flow_blocks, token)) + continue; + } else if (auto region_terminator = dyn_cast(curr_op)) { + RewriteControlFlowTerminator(builder, region_terminator, token); + // There is no next op afer the control flow op terminator, simply let + // stack have one less element. + continue; + } else if (auto func_terminator = dyn_cast(curr_op)) { + if (rewrite_block) + RewriteFunctionTerminator(builder, func_terminator, token); + + // There is no next op afer the function terminator, simply let stack have + // one less element/be empty. + continue; + } + + // Visit next op. + ops_to_visit.push_back({/*region_idx=*/llvm::None, token, next_op}); + } + + if (rewrite_block) UpdateFunctionType(builder, func, func_body); + + return success(); +} + +// Checks if a function call is pointing to a function with communication ops. +bool IsFunctionCallWithCommunication( + Operation* op, + const llvm::SmallDenseMap& funcs_to_rewrite) { + if (auto call = dyn_cast(op)) + return funcs_to_rewrite.count(call.callee()); + + return false; +} + +// Collects all control flow op ancestors of communication ops or function calls +// with communication ops (transitively). +void GetCommunicationControlFlowOps( + FuncOp func, + const llvm::SmallDenseMap& funcs_to_rewrite, + llvm::SmallPtrSetImpl& control_flow_ops, + llvm::SmallPtrSetImpl& control_flow_blocks) { + func.walk([&](Operation* op) { + if (IsCommunicationOp(op) || + IsFunctionCallWithCommunication(op, funcs_to_rewrite)) + if (failed(GetControlFlowAncestors(op, control_flow_ops, + control_flow_blocks))) + llvm_unreachable( + "checking original function for control flow ancestors should have " + "errored first"); + }); +} + +void LegalizeTFCommunication::runOnOperation() { + auto module = getOperation(); + llvm::SmallDenseMap funcs_to_rewrite; + if (failed(GetFunctionsToRewrite(module, funcs_to_rewrite))) + return signalPassFailure(); + + // Module level counter to make sure Channel Id's are unique. 
+ int64_t channel_id = 1; + OpBuilder builder(&getContext()); + for (const auto& func_and_name : funcs_to_rewrite) { + const auto& func_to_rewrite = func_and_name.getSecond(); + FuncOp func = func_to_rewrite.original; + if (failed(RewriteFunction(builder, channel_id, module, func, + funcs_to_rewrite, + func_to_rewrite.control_flow_ops, + func_to_rewrite.control_flow_blocks, + /*is_clone=*/false))) + return signalPassFailure(); + + FuncOp clone = func_and_name.getSecond().clone; + if (!clone) continue; + llvm::SmallPtrSet clone_control_flow_ops; + llvm::SmallPtrSet clone_control_flow_blocks; + GetCommunicationControlFlowOps(clone, funcs_to_rewrite, + clone_control_flow_ops, + clone_control_flow_blocks); + if (failed(RewriteFunction(builder, channel_id, module, clone, + funcs_to_rewrite, clone_control_flow_ops, + clone_control_flow_blocks, + /*is_clone=*/true))) + llvm_unreachable( + "rewriting of original function should have errored first"); + } +} + +static PassRegistration pass( + "xla-legalize-tf-communication", + "Legalize TF/XLA communication ops (TensorFlow dialect) to the HLO " + "dialect"); +} // namespace + +std::unique_ptr> CreateLegalizeTFCommunicationPass() { + return std::make_unique(); +} + +} // namespace mhlo +} // namespace mlir diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_control_flow.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_control_flow.cc index 09e94d9a84f..760252331e0 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_control_flow.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_control_flow.cc @@ -119,10 +119,8 @@ void LowerIf(TF::IfOp op, ModuleOp module) { // Import the regions for both the true and false cases. These regions // must be updated to tuple the return results together and use the xla hlo // return op. - auto then_branch = module.lookupSymbol(op.then_branch()); - auto else_branch = module.lookupSymbol(op.else_branch()); - ImportXlaRegion(then_branch, &if_op.true_branch(), loc); - ImportXlaRegion(else_branch, &if_op.false_branch(), loc); + ImportXlaRegion(op.then_func(), &if_op.true_branch(), loc); + ImportXlaRegion(op.else_func(), &if_op.false_branch(), loc); // De-tuple the results of the xla hlo if result. Detuple(if_op.getResult(), op.getResults(), &builder); @@ -174,11 +172,9 @@ void LowerWhile(TF::WhileOp op, ModuleOp module) { // Import the regions for both the cond and body. These regions must be // updated to tuple the return results together and use the xla hlo return op. - auto body_branch = module.lookupSymbol(op.body()); - auto cond_branch = module.lookupSymbol(op.cond()); - - ImportXlaRegion(body_branch, &while_op.body(), loc); - ImportXlaRegion(cond_branch, &while_op.cond(), loc, /*tuple_return=*/false); + ImportXlaRegion(op.body_func(), &while_op.body(), loc); + ImportXlaRegion(op.cond_func(), &while_op.cond(), loc, + /*tuple_return=*/false); // De-tuple the results of the xla hlo while. 
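The communication pass above registers as xla-legalize-tf-communication and is exposed through CreateLegalizeTFCommunicationPass. A minimal sketch of adding it to a module-level pipeline, assuming the factory is declared in the transforms passes header:

#include "mlir/Pass/PassManager.h"
#include "tensorflow/compiler/mlir/xla/transforms/passes.h"

// Sketch only: the pass rewrites whole modules (functions, calls, and
// control flow), so it is added at module level rather than nested.
static void AddCommunicationLegalization(mlir::PassManager& pm) {
  pm.addPass(mlir::mhlo::CreateLegalizeTFCommunicationPass());
}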
Detuple(while_op.getResult(), op.getResults(), &builder); diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td index 05e061337c7..1d4c9503afa 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td @@ -128,7 +128,7 @@ def : Pat<(TF_FloorDivOp AnyRankedTensor:$l, AnyRankedTensor:$r), // return x / y; // } // -// BraodcastToDimensions is used to compute the broadcast attr to higher +// BroadcastToDimensions is used to compute the broadcast attr to higher // dimensions. This computes the broadcast of 'l' to broadcast('l', 'r') // without returning the broadcast of 'r' to broadcast('l', 'r'). // @@ -143,14 +143,14 @@ def : Pat<(TF_FloorDivOp AnyRankedTensor:$l, AnyRankedTensor:$r), (HLOClient_BroadcastCompareOp $r, (HLO_ConstOp (GetScalarOfType<0> $r)), (NullDenseIntElementsAttr), HLO_COMPARISON_DIRECTION_LT), (BinBroadcastDimensions $l, $r), HLO_COMPARISON_DIRECTION_EQ), - (HLOClient_BroadcastDivOp $l, $r, (BinBroadcastDimensions $l, $r)), - (HLOClient_BroadcastDivOp - (HLO_NegOp:$neg (HLOClient_BroadcastAddOp (HLO_AbsOp $l), + (HLOClient_BroadcastDivOp $l, $r, (BinBroadcastDimensions $l, $r)), + (HLOClient_BroadcastDivOp + (HLO_NegOp:$neg (HLOClient_BroadcastAddOp (HLO_AbsOp $l), (HLOClient_BroadcastSubOp (HLO_AbsOp $r), (HLO_ConstOp (GetScalarOfType<1> $r)), (NullDenseIntElementsAttr)), (BinBroadcastDimensions $l, $r))), - (HLO_AbsOp:$abs $r), (BinBroadcastDimensions $neg, $abs))), + (HLO_AbsOp:$abs $r), (BinBroadcastDimensions $neg, $abs))), [(SignedIntTensor $l)]>; // Performs a substitution of FloorMod designed to correct for possibly negative @@ -175,8 +175,8 @@ def : Pat<(TF_FloorModOp AnyRankedTensor:$l, AnyRankedTensor:$r), (BinBroadcastDimensions $rem, $r_zeros), HLO_COMPARISON_DIRECTION_LT), (BinBroadcastDimensions $r_cmp, $rem_cmp), HLO_COMPARISON_DIRECTION_NE), (NullDenseIntElementsAttr)), - (HLOClient_BroadcastAddOp $r, - $rem, (BinBroadcastDimensions $r, $rem)), $rem)>; + (HLOClient_BroadcastAddOp $r, + $rem, (BinBroadcastDimensions $r, $rem)), $rem)>; //===----------------------------------------------------------------------===// // Logical & bitwise binary op patterns. @@ -489,7 +489,7 @@ def : Pat<(TF_SliceOp:$op HLO_Tensor:$input, HLO_Tensor:$starting_indices, $slice_sizes)]>; //===----------------------------------------------------------------------===// -// PartitionedCall op patterns. +// PartitionedCall and LegacyCall op patterns. //===----------------------------------------------------------------------===// def ArgTypesMatchCallee : Constraint< @@ -502,6 +502,12 @@ foreach callOp = [TF_PartitionedCallOp, TF_StatefulPartitionedCallOp] in { [(ArgTypesMatchCallee $op, $args, $f)]>; } +// The extra attr on this op is _disable_call_shape_inference, which we ignore +// in the bridge. +def : Pat<(TF_LegacyCallOp:$op $args, FlatSymbolRefAttr:$f, $attr), + (CallOp $f, $args), + [(ArgTypesMatchCallee $op, $args, $f)]>; + //===----------------------------------------------------------------------===// // Reverse op patterns. 
//===----------------------------------------------------------------------===// @@ -518,6 +524,7 @@ def : Pat<(TF_ReverseV2Op AnyRankedTensor:$values, (TF_ConstOp $axis)), foreach Mapping = [ [TF_AbsOp, HLO_AbsOp], + [TF_AcosOp, HLOClient_AcosOp], [TF_CeilOp, HLO_CeilOp], [TF_ComplexAbsOp, HLO_AbsOp], [TF_CosOp, HLO_CosOp], @@ -540,6 +547,19 @@ foreach Mapping = [ (Mapping[1] $input)>; } +// Expand acos to MHLO dialect as follows: +// acos(x) = 2 * atan(sqrt(1 - x^2) / (1 + x)) if x != -1 +// = pi if x == -1 +def : Pat<(HLOClient_AcosOp $input), (HLO_SelectOp + (HLO_CompareOp $input, (HLO_ConstantLike<"0"> $input), + HLO_COMPARISON_DIRECTION_NE), + (HLO_MulOp (HLO_ConstantLike<"2.0f"> $input), + (HLO_Atan2Op + (HLO_SqrtOp (HLO_SubOp + (HLO_ConstantLike<"1"> $input), (HLO_MulOp $input, $input))), + (HLO_AddOp (HLO_ConstantLike<"1"> $input), $input))), + (HLO_ConstantLike<"M_PI"> $input))>; + // TODO(bixia): Lower Cast with a Complex type source operand or with // Truncate=True for floating point value conversions. def : Pat<(TF_CastOp HLO_Tensor:$arg, ConstBoolAttrFalse), @@ -557,17 +577,8 @@ foreach TfOp = [TF_ExpandDimsOp, TF_ReshapeOp, TF_SqueezeOp, ] in { (HLO_ReshapeOp $arg), [(AnyStaticShapeTensor $res)]>; } -// Returns 0 if x is NaN, 0 if x is 0, -1 if x < 0 and 1 if x > 0. -def : Pat<(TF_SignOp $x), - (HLO_SelectOp - (HLO_CompareOp - $x, - $x, - HLO_COMPARISON_DIRECTION_NE - ), - (HLO_ConstOp (ConstantSplat<"0"> $x)), - (HLO_SignOp $x) - )>; +// Returns NaN if x is NaN, 0 if x is 0, -1 if x < 0 and 1 if x > 0. +def : Pat<(TF_SignOp $x), (HLO_SignOp $x)>; def BothElementTypesSameWidthIntOrFloat : Constraint; +// TODO(jpienaar): Lower constant like to constant to broadcast if dynamic +// and going to MHLO. + //===----------------------------------------------------------------------===// // Random ops. //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc index d25b38d9ece..904b80e05b1 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc @@ -24,12 +24,15 @@ limitations under the License. #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Diagnostics.h" // from @llvm-project #include "mlir/IR/Function.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/Module.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project @@ -45,7 +48,8 @@ limitations under the License. 
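The acos expansion added to legalize_tf_patterns.td above relies on the identity acos(x) = 2 * atan(sqrt(1 - x^2) / (1 + x)) for x != -1, with the select supplying pi at x == -1. A quick standalone numerical check of that identity (using atan2, which matches the HLO_Atan2Op form of the pattern):

#include <cassert>
#include <cmath>
#include <cstdio>

// Samples x in (-1, 1]; the x == -1 branch of the select returns pi directly.
int main() {
  for (int i = -99; i <= 100; ++i) {
    double x = i / 100.0;
    double expanded = 2.0 * std::atan2(std::sqrt(1.0 - x * x), 1.0 + x);
    assert(std::fabs(expanded - std::acos(x)) < 1e-9);
  }
  std::printf("acos(x) == 2 * atan2(sqrt(1 - x^2), 1 + x) holds on (-1, 1]\n");
  return 0;
}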
#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_context.h" #include "tensorflow/compiler/tf2xla/xla_expression.h" -#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/device_factory.h" @@ -70,12 +74,8 @@ limitations under the License. namespace mlir { namespace mhlo { -namespace { -template -using InlinedVector = tensorflow::gtl::InlinedVector; // non-absl ok - -static bool IsOpAllowlisted(Operation* op) { +bool IsOpAllowedTf2XlaFallback(Operation* op) { // Allowlisted TensorFlow ops are known to have well behaved tf2xla kernels // building valid MLIR using MlirHloBuilder. // TODO(hinsu): Drop explicit allowlist when MLIR based bridge is enabled for @@ -100,6 +100,8 @@ static bool IsOpAllowlisted(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), @@ -152,9 +154,12 @@ static bool IsOpAllowlisted(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), @@ -174,6 +179,7 @@ static bool IsOpAllowlisted(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), @@ -182,6 +188,8 @@ static bool IsOpAllowlisted(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), @@ -209,8 +217,13 @@ static bool IsOpAllowlisted(Operation* op) { return ops.count(abstractOp->typeID); } +namespace { + +template +using InlinedVector = tensorflow::gtl::InlinedVector; // non-absl ok + static std::unique_ptr CreateDeviceMgr( - const std::string& device_type, const Location& loc) { + const std::string& device_type) { // Register compilation kernels for all registered XLA backends. tensorflow::XlaOpRegistry::RegisterCompilationKernels(); @@ -219,42 +232,47 @@ static std::unique_ptr CreateDeviceMgr( return absl::make_unique(std::move(device)); } -class FuncLegalizer { +class Tf2XlaRewriter { public: - static LogicalResult Legalize(FuncOp func, const std::string& device_type) { - FuncLegalizer legalizer(func, device_type); - if (failed(legalizer.PrepareParams())) return failure(); - return legalizer.Legalize(); + static LogicalResult RewriteOp(Operation* op, PatternRewriter& rewriter, + const std::string& device_type) { + Tf2XlaRewriter tf2xla_rewriter(op, rewriter, device_type); + return tf2xla_rewriter.LegalizeOp(); } private: - FuncLegalizer(FuncOp func, const std::string& device_type) - : func_(func), device_type_(device_type), hlo_builder_(func) {} + Tf2XlaRewriter(Operation* op, PatternRewriter& rewriter, + const std::string& device_type) + : op_(op), + device_type_(device_type), + rewriter_(rewriter), + hlo_builder_(op->getName().getStringRef().str(), rewriter_, + op->getLoc()), + context_(nullptr) {} - ~FuncLegalizer() { context_->Unref(); } + ~Tf2XlaRewriter() { + if (context_) context_->Unref(); + } // Prepares OpKernelContext params common to all the ops. // Emits an error on failure. LogicalResult PrepareParams(); - // Tries to legalize supported TensorFlow ops. - // Emits an error on failure. 
- LogicalResult Legalize(); - // Tries to legalize the specified TensorFlow op, if supported. // // Emits an error and returns failure if an error is encountered during // conversion. Note that success return value doesn't mean successful // legalization. - LogicalResult LegalizeOp(Operation* op); + LogicalResult LegalizeOp(); // Converts the given operand to expression of kind kConstant or kXlaOp. // Emits a remark and returns expression of kind kInvalid on failure. tensorflow::XlaExpression GetExprForOperand(Value operand, Operation* op); - FuncOp func_; + Operation* op_; std::string device_type_; + PatternRewriter& rewriter_; ::xla::MlirHloBuilder hlo_builder_; tensorflow::OpOrArgLocNameMapper name_mapper_; @@ -268,15 +286,14 @@ class FuncLegalizer { tensorflow::OpKernelContext::Params params_; }; -LogicalResult FuncLegalizer::PrepareParams() { +LogicalResult Tf2XlaRewriter::PrepareParams() { // XlaCompiler within the context is only used by the functional ops to // compile functions. We are not handling those at the moment so XlaCompiler // is not required. context_ = new tensorflow::XlaContext(/*compiler=*/nullptr, &hlo_builder_); context_->Ref(); - mlir::Location loc = func_.getLoc(); - device_mgr_ = CreateDeviceMgr(device_type_, loc); + device_mgr_ = CreateDeviceMgr(device_type_); if (!device_mgr_) return failure(); // Type of params_.device is DeviceBase* so store it as Device* to access @@ -296,18 +313,16 @@ LogicalResult FuncLegalizer::PrepareParams() { device_->resource_manager(), tensorflow::XlaContext::kXlaContextResourceName, context_); if (!status.ok()) { - emitError(loc) << "failed to create XlaContext resource: " - << status.ToString(); - return failure(); + return emitError(op_->getLoc()) + << "failed to create XlaContext resource: " << status.ToString(); } params_.step_container = step_container_.get(); tensorflow::StatusOr version_or = tensorflow::GetTfGraphProducerVersion( - func_.getParentOfType()); + op_->getParentOfType()); if (!version_or.ok()) { - emitError(loc) << version_or.status().ToString(); - return failure(); + return emitError(op_->getLoc()) << version_or.status().ToString(); } flib_def_ = absl::make_unique( @@ -319,62 +334,38 @@ LogicalResult FuncLegalizer::PrepareParams() { return success(); } -LogicalResult FuncLegalizer::Legalize() { - if (func_.empty()) return success(); - - // TensorFlow functions don't use CFGs. - if (!llvm::hasSingleElement(func_)) { - emitError(func_.getLoc()) << "requires at most one block in a TF function"; - return failure(); - } - Block& block = func_.front(); - - std::vector ops; - ops.reserve(block.getOperations().size()); - for (Operation& op : block.getOperations()) { - ops.push_back(&op); - } - - for (Operation* op : ops) { - if (failed(LegalizeOp(op))) return failure(); - } - return success(); -} - -LogicalResult FuncLegalizer::LegalizeOp(Operation* op) { - if (!IsOpAllowlisted(op)) return success(); - +LogicalResult Tf2XlaRewriter::LegalizeOp() { // Only static shaped operands are supported in XLA builders for now. 
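The rewritten LegalizeOp above now returns failure (instead of silently succeeding) unless every operand is a ranked tensor with a fully static shape. A small standalone illustration of what that filter accepts and rejects, assuming the StandardTypes API already included by this file:

#include "mlir/IR/MLIRContext.h"
#include "mlir/IR/StandardTypes.h"

// Sketch only: mirrors the hasStaticShape() check in LegalizeOp.
bool OperandTypeIsSupported(mlir::Type ty) {
  auto ranked_ty = ty.dyn_cast<mlir::RankedTensorType>();
  return ranked_ty && ranked_ty.hasStaticShape();
}

void Demo(mlir::MLIRContext* context) {
  auto f32 = mlir::FloatType::getF32(context);
  // tensor<2x3xf32>: ranked and static -> supported.
  auto static_ty = mlir::RankedTensorType::get({2, 3}, f32);
  // tensor<?x3xf32>: ranked but dynamic in dim 0 -> rejected.
  auto dynamic_ty =
      mlir::RankedTensorType::get({mlir::ShapedType::kDynamicSize, 3}, f32);
  (void)OperandTypeIsSupported(static_ty);   // true
  (void)OperandTypeIsSupported(dynamic_ty);  // false
}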
- for (Type ty : op->getOperandTypes()) { + for (Type ty : op_->getOperandTypes()) { auto ranked_ty = ty.dyn_cast(); if (!ranked_ty || !ranked_ty.hasStaticShape()) { - op->emitRemark() << "lowering requires static shaped tensor operands"; - return success(); + return op_->emitRemark() + << "lowering requires static shaped tensor operands"; } } auto nodedef_or = tensorflow::ConvertTFDialectOpToNodeDef( - op, name_mapper_.GetUniqueName(op), /*ignore_unregistered_attrs=*/true); + op_, name_mapper_.GetUniqueName(op_), /*ignore_unregistered_attrs=*/true); if (!nodedef_or.ok()) { - op->emitRemark() << "failed to convert op to NodeDef: " - << nodedef_or.status().ToString(); - return success(); + return op_->emitRemark() << "failed to convert op to NodeDef: " + << nodedef_or.status().ToString(); } + if (failed(PrepareParams())) return failure(); + std::shared_ptr props; tensorflow::Status status = tensorflow::NodeProperties::CreateFromNodeDef( *nodedef_or.ValueOrDie(), params_.function_library->GetFunctionLibraryDefinition(), &props); if (!status.ok()) { - op->emitRemark() << "failed to create NodeProperties: " - << status.ToString(); - return success(); + return op_->emitRemark() + << "failed to create NodeProperties: " << status.ToString(); } tensorflow::OpKernel* op_kernel_raw; status = params_.function_library->CreateKernel(props, &op_kernel_raw); if (!status.ok()) { - op->emitRemark() << "failed to create tf2xla kernel: " << status.ToString(); - return success(); + return op_->emitRemark() + << "failed to create tf2xla kernel: " << status.ToString(); } // Transfer ownership of the kernel to a local smart pointer. auto op_kernel = absl::WrapUnique(op_kernel_raw); @@ -383,9 +374,8 @@ LogicalResult FuncLegalizer::LegalizeOp(Operation* op) { status = tensorflow::XlaOpRegistry::CompileTimeConstantInputs( *op_kernel, &required_constants); if (!status.ok()) { - op->emitRemark() << "failed to compute required constants: " - << status.ToString(); - return success(); + return op_->emitRemark() + << "failed to compute required constants: " << status.ToString(); } llvm::SmallDenseSet required_consts; required_consts.insert(required_constants.begin(), required_constants.end()); @@ -395,89 +385,87 @@ LogicalResult FuncLegalizer::LegalizeOp(Operation* op) { InlinedVector expressions; InlinedVector tensors; InlinedVector inputs; - expressions.reserve(op->getNumOperands()); - tensors.reserve(op->getNumOperands()); - inputs.reserve(op->getNumOperands()); + expressions.reserve(op_->getNumOperands()); + tensors.reserve(op_->getNumOperands()); + inputs.reserve(op_->getNumOperands()); // Prepare the list of Tensor inputs for the kernel. 
- for (auto it : llvm::enumerate(op->getOperands())) { + for (auto it : llvm::enumerate(op_->getOperands())) { Value operand = it.value(); size_t idx = it.index(); - tensorflow::XlaExpression expr = GetExprForOperand(operand, op); + tensorflow::XlaExpression expr = GetExprForOperand(operand, op_); tensorflow::XlaExpression::Kind kind = expr.kind(); - if (kind == tensorflow::XlaExpression::Kind::kInvalid) return success(); + if (kind == tensorflow::XlaExpression::Kind::kInvalid) return failure(); if (required_consts.count(idx) && kind != tensorflow::XlaExpression::Kind::kConstant) { - op->emitRemark() << "lowering requires operand #" << idx - << " to be a constant"; - return success(); + return op_->emitRemark() + << "lowering requires operand #" << idx << " to be a constant"; } expressions.push_back(expr); if (!tensorflow::DataTypeCanUseMemcpy(expr.dtype())) { - op->emitRemark() << "skipping legalization due to unsupported type " - << operand.getType(); - return success(); + return op_->emitRemark() + << "skipping legalization due to unsupported type " + << operand.getType(); } auto shape_or = expr.GetShape(); if (!shape_or.ok()) { - op->emitRemark() << "failed to get shape for expression. " - << expr.HumanString(); - return success(); + return op_->emitRemark() + << "failed to get shape for expression. " << expr.HumanString(); } tensors.emplace_back( device_->GetAllocator(tensorflow::AllocatorAttributes()), expr.dtype(), shape_or.ValueOrDie()); tensorflow::Tensor& tensor = tensors.back(); - tensorflow::XlaOpKernelContext::AssignExpressionToTensor(expr, &tensor); + tensorflow::XlaExpression::AssignExpressionToTensor(expr, &tensor); inputs.emplace_back(&tensor); } params_.inputs = &inputs; params_.op_kernel = op_kernel.get(); llvm::SmallVector output_attr( - op->getNumResults()); + op_->getNumResults()); params_.output_attr_array = output_attr.data(); - hlo_builder_.setInsertionPoint(op); - hlo_builder_.SetLocation(op->getLoc()); + hlo_builder_.setInsertionPoint(op_); + hlo_builder_.SetLocation(op_->getLoc()); // Execute the kernel. - tensorflow::OpKernelContext op_context(¶ms_, op->getNumResults()); + tensorflow::OpKernelContext op_context(¶ms_, op_->getNumResults()); device_->Compute(params_.op_kernel, &op_context); if (!op_context.status().ok()) { - op->emitRemark() << "compilation to HLO failed: " - << op_context.status().ToString(); - return success(); + return op_->emitRemark() + << "compilation to HLO failed: " << op_context.status().ToString(); } // Replace uses of old results using the corresponding value after the // lowering. 
- for (int i = 0, e = op->getNumResults(); i < e; i++) { + llvm::SmallVector values; + values.reserve(op_->getNumResults()); + for (int i = 0, e = op_->getNumResults(); i < e; i++) { tensorflow::Tensor* output = op_context.mutable_output(i); const tensorflow::XlaExpression* expr = - tensorflow::XlaOpKernelContext::CastExpressionFromTensor(*output); + tensorflow::XlaExpression::CastExpressionFromTensor(*output); if (expr->kind() != tensorflow::XlaExpression::Kind::kXlaOp) - return op->emitError( + return op_->emitError( "expects XlaExpression of kind kXlaOp in compiled output"); auto value = hlo_builder_.GetValue(expr->handle()); - mlir::OpResult old_result = op->getResult(i); + mlir::OpResult old_result = op_->getResult(i); if (value.getType() != old_result.getType()) { value = hlo_builder_.create(value, old_result.getType()); } - old_result.replaceAllUsesWith(value); + values.push_back(value); } - - op->erase(); + rewriter_.replaceOp(op_, values); return success(); } -tensorflow::XlaExpression FuncLegalizer::GetExprForOperand(Value operand, - Operation* op) { +tensorflow::XlaExpression Tf2XlaRewriter::GetExprForOperand(Value operand, + Operation* op) { ElementsAttr const_attr; auto defining_op = operand.getDefiningOp(); if (defining_op && matchPattern(defining_op, m_Constant(&const_attr))) { @@ -509,6 +497,23 @@ tensorflow::XlaExpression FuncLegalizer::GetExprForOperand(Value operand, return tensorflow::XlaExpression::XlaOp(xla_op, dtype); } +class Tf2XlaRewritePattern : public RewritePattern { + public: + // Set benefit to 0 (= least benefit) so this pattern is only used as a + // fallback. + explicit Tf2XlaRewritePattern(const std::string& device_type) + : RewritePattern(0, MatchAnyOpTypeTag()), device_type_(device_type) {} + + LogicalResult matchAndRewrite(Operation* op, + PatternRewriter& rewriter) const override { + if (!IsOpAllowedTf2XlaFallback(op)) return failure(); + return Tf2XlaRewriter::RewriteOp(op, rewriter, device_type_); + } + + private: + std::string device_type_; +}; + class LegalizeTF : public PassWrapper { public: LegalizeTF() = default; @@ -520,7 +525,9 @@ class LegalizeTF : public PassWrapper { LegalizeTF(const LegalizeTF&) {} void runOnFunction() override { - if (failed(FuncLegalizer::Legalize(getFunction(), device_type_))) + OwningRewritePatternList patterns; + patterns.insert(device_type_); + if (failed(applyPatternsAndFoldGreedily(getFunction(), patterns))) signalPassFailure(); } @@ -529,8 +536,7 @@ class LegalizeTF : public PassWrapper { // global device type for all TensorFlow ops. Option device_type_{ *this, "device-type", - llvm::cl::desc("XLA device type for execution of TensorFlow ops. " - "Supports XLA_CPU_JIT and XLA_TPU_JIT for now.")}; + llvm::cl::desc("XLA device type for execution of TensorFlow ops.")}; }; static PassRegistration pass( @@ -539,6 +545,11 @@ static PassRegistration pass( } // end namespace +void PopulateLegalizeTfWithTf2XlaPatterns(llvm::StringRef device_type, + OwningRewritePatternList& patterns) { + patterns.insert(device_type.str()); +} + std::unique_ptr> createLegalizeTfWithTf2XlaPass( llvm::StringRef device_type) { return std::make_unique(device_type); diff --git a/tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.cc b/tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.cc index 519068893e7..832bad2dcc8 100644 --- a/tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.cc +++ b/tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.cc @@ -33,16 +33,20 @@ limitations under the License. 
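The new Tf2XlaRewritePattern above registers with benefit 0, so it only fires when no higher-benefit pattern matches, and PopulateLegalizeTfWithTf2XlaPatterns exposes it for reuse. A sketch of combining it with the direct TF-to-MHLO patterns in one greedy rewrite, assuming the populate functions keep the signatures shown in this diff (device type and header locations are assumptions):

#include "mlir/IR/Function.h"
#include "mlir/IR/PatternMatch.h"
#include "tensorflow/compiler/mlir/xla/transforms/passes.h"
// Note: the header declaring applyPatternsAndFoldGreedily is assumed to be
// available here; adjust includes to match the real tree.

static mlir::LogicalResult LegalizeWithFallback(mlir::FuncOp func) {
  mlir::OwningRewritePatternList patterns;
  // Direct TF -> MHLO lowerings (benefit >= 1).
  mlir::mhlo::PopulateLegalizeTfPatterns(func.getContext(), &patterns);
  // tf2xla kernel fallback (benefit 0), tried only when nothing else matches.
  mlir::mhlo::PopulateLegalizeTfWithTf2XlaPatterns("XLA_CPU_JIT", patterns);
  return mlir::applyPatternsAndFoldGreedily(func, patterns);
}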
#include "mlir/IR/SymbolTable.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassOptions.h" // from @llvm-project +#include "mlir/Translation.h" // from @llvm-project #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" #include "tensorflow/compiler/mlir/xla/hlo_function_importer.h" #include "tensorflow/compiler/mlir/xla/hlo_utils.h" #include "tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.h" +#include "tensorflow/compiler/xla/debug_options_flags.h" +#include "tensorflow/compiler/xla/service/backend.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/hlo_casting_utils.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/util.h" @@ -74,26 +78,8 @@ StatusOr> HloModuleFromProto( // Convert the MLIR `module` from HLO dialect to LHLO dialect using XLA for the // given platform. -Status ConvertModule(ModuleOp module, StringRef platform_name) { - SymbolTable symbol_table(module); - if (!symbol_table.lookup("main")) { - return ::xla::InvalidArgument( - "conversion to HLO module failed: missing main()"); - } - HloProto hlo_proto; - TF_RETURN_WITH_CONTEXT_IF_ERROR( - ConvertMlirHloToHlo(module, &hlo_proto, - /*use_tuple_args=*/false, - /*return_tuple=*/false, - /*shape_representation_fn=*/nullptr), - "conversion to XLA HLO proto failed"); - - auto statusOrHloModule = HloModuleFromProto(hlo_proto); - TF_RETURN_WITH_CONTEXT_IF_ERROR(statusOrHloModule.status(), - "parsing HLO proto to HLO module failed"); - std::unique_ptr hlo_module = - std::move(statusOrHloModule.ValueOrDie()); - +Status ConvertModule(std::unique_ptr hlo_module, ModuleOp module, + StringRef platform_name) { auto platform = ::xla::se::MultiPlatformManager::PlatformWithName( StringRefToView(platform_name)); if (!platform.ok()) { @@ -155,7 +141,29 @@ class XlaHloToLhloPass private: void runOnOperation() final { ModuleOp module = getOperation(); - Status status = ConvertModule(module, platform_); + + auto status = [&module, this]() -> Status { + SymbolTable symbol_table(module); + if (!symbol_table.lookup("main")) { + return ::xla::InvalidArgument( + "conversion to HLO module failed: missing main()"); + } + HloProto hlo_proto; + TF_RETURN_WITH_CONTEXT_IF_ERROR( + ConvertMlirHloToHlo(module, &hlo_proto, + /*use_tuple_args=*/false, + /*return_tuple=*/false, + /*shape_representation_fn=*/nullptr), + "conversion to XLA HLO proto failed"); + + auto statusOrHloModule = HloModuleFromProto(hlo_proto); + TF_RETURN_WITH_CONTEXT_IF_ERROR(statusOrHloModule.status(), + "parsing HLO proto to HLO module failed"); + std::unique_ptr hlo_module = + std::move(statusOrHloModule.ValueOrDie()); + + return ConvertModule(std::move(hlo_module), module, platform_); + }(); if (!status.ok()) { module.emitError() << status.ToString(); return signalPassFailure(); @@ -272,7 +280,6 @@ Status LhloDialectEmitter::CreateView(const HloInstruction* instr, } return Status::OK(); } - TF_ASSIGN_OR_RETURN(Type out_type, ::xla::ConvertShapeToType( current_shape, builder_)); TF_ASSIGN_OR_RETURN(BufferAllocation::Slice slice, @@ -283,11 +290,35 @@ Status LhloDialectEmitter::CreateView(const HloInstruction* instr, return 
Status::OK(); } + auto out_memref_type = out_type.dyn_cast(); + if (!out_memref_type) + return tensorflow::errors::Internal( + "Expected memref type when creating a view for leaf type of a tuple."); + Value byte_shift = builder_.create(alloc.getLoc(), slice.offset()); - values->push_back(builder_.create(builder_.getUnknownLoc(), out_type, - alloc, byte_shift, - /*sizes=*/ValueRange{})); + + xla::Shape physical_shape = + xla::ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( + current_shape); + TF_ASSIGN_OR_RETURN( + Type physical_out_type, + ::xla::ConvertShapeToType(physical_shape, builder_)); + + // TODO(timshen): revisit location handling. + Location loc = builder_.getUnknownLoc(); + + // ViewOp only takes memrefs without affine maps (layouts). Let ViewOp produce + // the physical shape (where dimensions are ordered in major to minor) first, + // then follow up with a StaticMemRefCastOp to cast the resulting memref to + // the original layout. + Value result = + builder_.create(loc, physical_out_type, alloc, byte_shift, + /*sizes=*/ValueRange{}); + if (physical_out_type != out_type) + result = builder_.create(loc, out_memref_type, + result); + values->push_back(result); return Status::OK(); } @@ -333,40 +364,43 @@ Status LhloDialectEmitter::Initialize() { for (const BufferAllocation& alloc : assignment_.Allocations()) ordered_allocations.push_back(&alloc); - // Sort the rather arbitrarily ordered allocations to match the input/output - // parameters. Specifically We want to sort buffer allocations in the - // following order: - // * Parameters always order before non-parameters. - // * Different parameters order by parameter number. - // * Different allocations for the same parameter order by the shape index. - // - // TODO(timshen): there should be only one non-parameter buffer, the temp - // buffer. Check on that. - const auto allocation_comparator = [](const BufferAllocation* lhs, - const BufferAllocation* rhs) { - if (lhs->is_entry_computation_parameter() != - rhs->is_entry_computation_parameter()) { - return lhs->is_entry_computation_parameter() > - rhs->is_entry_computation_parameter(); - } - if (lhs->is_entry_computation_parameter()) { - return std::tuple( - lhs->parameter_number(), lhs->param_shape_index()) < - std::tuple( - rhs->parameter_number(), rhs->param_shape_index()); - } - return false; - }; + if (computation_.IsEntryComputation()) { + // Sort the rather arbitrarily ordered allocations to match the input/output + // parameters. Specifically We want to sort buffer allocations in the + // following order: + // * Parameters always order before non-parameters. + // * Different parameters order by parameter number. + // * Different allocations for the same parameter order by the shape index. + // + // TODO(timshen): there should be only one non-parameter buffer, the temp + // buffer. Check on that. 
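      // For example, with two entry-computation parameters where parameter 0
      // is a tuple of two leaves, the comparator below yields the order:
      //   (param 0, shape index {0}), (param 0, {1}), (param 1, {}), then any
      //   non-parameter (temp) allocations.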
+ const auto allocation_comparator = [](const BufferAllocation* lhs, + const BufferAllocation* rhs) { + if (lhs->is_entry_computation_parameter() != + rhs->is_entry_computation_parameter()) { + return lhs->is_entry_computation_parameter() > + rhs->is_entry_computation_parameter(); + } + if (lhs->is_entry_computation_parameter()) { + return std::tuple( + lhs->parameter_number(), lhs->param_shape_index()) < + std::tuple( + rhs->parameter_number(), rhs->param_shape_index()); + } + return false; + }; - std::stable_sort(ordered_allocations.begin(), ordered_allocations.end(), - allocation_comparator); + std::stable_sort(ordered_allocations.begin(), ordered_allocations.end(), + allocation_comparator); + } // The function signature will be composed of: // - one memref for each of the parameters. // - one memref for each other buffer allocation. llvm::SmallVector args_attrs; for (const BufferAllocation* alloc : ordered_allocations) { - if (alloc->is_entry_computation_parameter()) { + if (computation_.IsEntryComputation() && + alloc->is_entry_computation_parameter()) { const ::xla::Shape& buffer_shape = ::xla::ShapeUtil::GetSubshape( computation_.parameter_instruction(alloc->parameter_number()) ->shape(), @@ -379,6 +413,8 @@ Status LhloDialectEmitter::Initialize() { block->addArgument(arg_type); allocations_[alloc] = block->getArguments().back(); args_attrs.emplace_back(); + args_attrs.back().set(builder_.getIdentifier("lmhlo.alloc"), + builder_.getIndexAttr(alloc->index())); args_attrs.back().set(builder_.getIdentifier("lmhlo.params"), builder_.getIndexAttr(alloc->parameter_number())); } else { @@ -427,6 +463,22 @@ Status HloToLhloModule(const BufferAssignment& assignment, return computation->AcceptOrdered(&emitter, ordering); } +mlir::OwningModuleRef HloTextToLhloTranslateFunction( + llvm::StringRef input, mlir::MLIRContext* context) { + StatusOr> maybe_module = + xla::ParseAndReturnUnverifiedModule( + absl::string_view(input.data(), input.size())); + TF_CHECK_OK(maybe_module.status()); + + mlir::OwningModuleRef module = + mlir::ModuleOp::create(mlir::UnknownLoc::get(context)); + + TF_CHECK_OK( + ConvertModule(maybe_module.ConsumeValueOrDie(), module.get(), "Host")); + + return module; +} + static PassRegistration registration( "xla-hlo-to-lhlo-with-xla", "Emit LHLO from HLO using the existing XLA implementation"); diff --git a/tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.h b/tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.h index ca40eb5804c..bdc977616b1 100644 --- a/tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.h +++ b/tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.h @@ -127,6 +127,9 @@ tensorflow::Status HloToLhloModule(const ::xla::BufferAssignment& assignment, const ::xla::HloModule& hlo_module, ModuleOp module); +OwningModuleRef HloTextToLhloTranslateFunction(llvm::StringRef input, + mlir::MLIRContext* context); + } // namespace mlir #endif // TENSORFLOW_COMPILER_MLIR_XLA_TRANSFORMS_MHLO_TO_LHLO_WITH_XLA_H_ diff --git a/tensorflow/compiler/mlir/xla/transforms/passes.h b/tensorflow/compiler/mlir/xla/transforms/passes.h index bc261324055..45166941620 100644 --- a/tensorflow/compiler/mlir/xla/transforms/passes.h +++ b/tensorflow/compiler/mlir/xla/transforms/passes.h @@ -18,6 +18,9 @@ limitations under the License. 
#include +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project namespace mlir { @@ -33,14 +36,31 @@ namespace mhlo { /// Lowers from TF dialect to HLO dialect. When allow_partial_conversion is /// false, emits an error if there is any operation that can't be legalized. +/// When `tf2xla_fallback_device_type` is not `None`, also uses legalization +/// patterns from TF2XLA fallback for provided device type (see +/// legalize_tf_with_tf2xla.cc for details). By default, TF2XLA fallback is not +/// used. std::unique_ptr> createLegalizeTFPass( - bool allow_partial_conversion = false, bool legalize_chlo = true); + bool allow_partial_conversion = false, bool legalize_chlo = true, + llvm::Optional tf2xla_fallback_device_type = llvm::None); /// Lowers from TF dialect to HLO dialect using tf2xla op kernels for the /// specified device type. std::unique_ptr> createLegalizeTfWithTf2XlaPass( llvm::StringRef device_type); +/// Adds the TF to XLA via TF2XLA rewrite patterns to the pattern list. +void PopulateLegalizeTfWithTf2XlaPatterns(llvm::StringRef device_type, + OwningRewritePatternList& patterns); + +/// Adds the TF to TF lowerings and TF to XLA rewrite patterns to the pattern +/// list. +void PopulateLegalizeTfPatterns(MLIRContext* context, + OwningRewritePatternList* patterns); + +/// Checks whether the op is supported by the Tf2Xla fallback for legalization. +bool IsOpAllowedTf2XlaFallback(Operation* op); + /// Lowers from TF dialect's control flow to HLO dialect's control flow. std::unique_ptr> createLegalizeTFControlFlowPass(); @@ -48,8 +68,18 @@ std::unique_ptr> createLegalizeTFControlFlowPass(); /// dialect using the conversion patterns registered by the HLO dialect. When /// allow_partial_conversion is false, emits an error if there is any operation /// that can't be legalized. -LogicalResult legalizeTF(Operation* op, bool allow_partial_conversion = false, - bool legalize_chlo = true); +/// When `tf2xla_fallback_device_type` is not `None`, also uses legalization +/// patterns from TF2XLA fallback for provided device type (see +/// legalize_tf_with_tf2xla.cc for details). By default, TF2XLA fallback is not +/// used. +LogicalResult legalizeTF( + Operation* op, bool allow_partial_conversion = false, + bool legalize_chlo = true, + llvm::Optional tf2xla_fallback_device_type = llvm::None); + +// Legalizes TF/XLA communication ops (TF dialect) to HLO dialect communication +// ops. +std::unique_ptr> CreateLegalizeTFCommunicationPass(); } // namespace mhlo } // namespace mlir diff --git a/tensorflow/compiler/mlir/xla/type_to_shape.cc b/tensorflow/compiler/mlir/xla/type_to_shape.cc index b684abde7a5..afc36916348 100644 --- a/tensorflow/compiler/mlir/xla/type_to_shape.cc +++ b/tensorflow/compiler/mlir/xla/type_to_shape.cc @@ -145,11 +145,37 @@ Shape TypeToShape(mlir::Type type) { // For the primitive type case, the shape of the memref is similar to the // vector type case (i.e., it is, modulo the layout, the same dimensions // and primitive type). - // Currently we only return shapes for identity affine maps. - // TODO(andydavis) Map affine map layout function to XLA layout. 
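      // A minimal sketch of the stride-to-layout recovery implemented below
      // (hypothetical standalone helper, assuming LLVM ADT containers): for a
      // memref of shape [10, 20, 30] with strides [30, 300, 1] it returns
      // {2, 0, 1}, i.e. dimension 2 is minor-most (stride 1) and dimension 1
      // is major-most.
      static llvm::SmallVector<int64_t, 4> MinorToMajorFromStrides(
          llvm::ArrayRef<int64_t> shape, llvm::ArrayRef<int64_t> strides) {
        llvm::SmallVector<std::pair<int64_t, int64_t>, 4> stride_and_dim;
        for (size_t i = 0; i < strides.size(); ++i)
          stride_and_dim.push_back({strides[i], static_cast<int64_t>(i)});
        std::sort(stride_and_dim.begin(), stride_and_dim.end());
        llvm::SmallVector<int64_t, 4> minor_to_major;
        int64_t expected_stride = 1;
        for (const auto& pair : stride_and_dim) {
          // A dense (perfectly strided) layout must have each stride equal to
          // the product of the sizes of all more-minor dimensions.
          if (pair.first != expected_stride) return {};
          minor_to_major.push_back(pair.second);
          expected_stride *= shape[pair.second];
        }
        return minor_to_major;
      }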
- if (m.getAffineMaps().empty() || - (m.getAffineMaps().size() == 1 && m.getAffineMaps()[0].isIdentity())) + if (m.getAffineMaps().empty()) return ShapeUtil::MakeShape(primitive_type, span); + + if (m.getAffineMaps().size() == 1) { + llvm::SmallVector strides; + int64_t offset; + if (failed(mlir::getStridesAndOffset(m, strides, offset))) return {}; + + llvm::SmallVector, 4> strides_with_indices; + for (const auto& e : llvm::enumerate(strides)) { + strides_with_indices.push_back({e.value(), e.index()}); + } + std::sort(strides_with_indices.begin(), strides_with_indices.end()); + + llvm::SmallVector minor_to_major; + int64_t stride = 1; + for (const auto& pr : strides_with_indices) { + minor_to_major.push_back(pr.second); + + // Either the affine map is not perfectly strided, or the dimensions + // recovered from strides don't match the actual dimensions in shapes. + if (stride != pr.first) return {}; + + stride *= m.getShape()[pr.second]; + } + + llvm::SmallVector dimensions(m.getShape().begin(), + m.getShape().end()); + return ::xla::ShapeUtil::MakeShapeWithLayout(primitive_type, dimensions, + minor_to_major); + } break; } case mlir::StandardTypes::RankedTensor: { diff --git a/tensorflow/compiler/mlir/xla/type_to_shape_test.cc b/tensorflow/compiler/mlir/xla/type_to_shape_test.cc index b2a7cb85686..a4a2bc42d99 100644 --- a/tensorflow/compiler/mlir/xla/type_to_shape_test.cc +++ b/tensorflow/compiler/mlir/xla/type_to_shape_test.cc @@ -20,6 +20,7 @@ limitations under the License. #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "tensorflow/compiler/mlir/xla/hlo_utils.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -178,5 +179,22 @@ TEST(TypeToShapeTest, ConvertWithShapeRepresentationFn) { EXPECT_EQ(captured_tensor_shape, tensorflow::TensorShape({1, 2, 3})); } +TEST(TypeToShapeTest, ConvertMemRefToShape) { + Shape shape = ShapeUtil::MakeShapeWithLayout(PrimitiveType::F32, {10, 20, 30}, + {2, 0, 1}); + MLIRContext context; + mlir::Builder builder(&context); + + StatusOr mlir_type = + ConvertShapeToType(shape, builder); + ASSERT_TRUE(mlir_type.ok()); + mlir::Type type = mlir_type.ConsumeValueOrDie(); + Shape converted = TypeToShape(type); + EXPECT_TRUE(ShapeUtil::Equal( + converted, ShapeUtil::MakeShapeWithLayout(PrimitiveType::F32, + {10, 20, 30}, {2, 0, 1}))); + EXPECT_TRUE(ShapeUtil::Equal(converted, shape)); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/mlir/xla/xla_mlir_translate.cc b/tensorflow/compiler/mlir/xla/xla_mlir_translate.cc index 211470bd41e..158671a6242 100644 --- a/tensorflow/compiler/mlir/xla/xla_mlir_translate.cc +++ b/tensorflow/compiler/mlir/xla/xla_mlir_translate.cc @@ -21,6 +21,7 @@ limitations under the License. 
#include "mlir/Translation.h" // from @llvm-project #include "tensorflow/compiler/mlir/xla/hlo_to_mlir_hlo.h" #include "tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.h" +#include "tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.h" #include "tensorflow/compiler/xla/debug_options_flags.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_parser.h" @@ -183,3 +184,8 @@ static mlir::TranslateToMLIRRegistration HloToHloMlirTranslate( static mlir::TranslateToMLIRRegistration HloTextToHloMlirTranslate( "hlo-text-to-mlir-hlo", xla::HloTextToMlirHloTranslateFunction); + +// MHLO doesn't support explicit layouts, while XLA service does. +// TODO(timshen): remove it once MHLO supports explicit layouts. +static mlir::TranslateToMLIRRegistration HloTextToLhloMlirTranslate( + "hlo-text-to-lhlo", mlir::HloTextToLhloTranslateFunction); diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index 0b5a6c147dc..924834fc0fc 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -21,10 +21,6 @@ package_group( includes = [ "//tensorflow/compiler/tf2xla:internal", ], - packages = [ - # To pass open source testing in the pip Kokoros. - "//bazel_pip/tensorflow/compiler/tests/...", - ], ) package_group( @@ -34,7 +30,6 @@ package_group( ], packages = [ # To pass open source testing in the pip Kokoros. - "//bazel_pip/tensorflow/compiler/tests/...", "//platforms/xla/tests/neural_nets", ], ) @@ -128,7 +123,6 @@ tf_xla_py_test( name = "adagrad_da_test", size = "small", srcs = ["adagrad_da_test.py"], - enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -166,7 +160,6 @@ tf_xla_py_test( srcs = ["add_n_test.py"], # TensorList ops are not implemented in the on-demand compilation model yet. 
disabled_backends = ["cpu_ondemand"], - enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -331,6 +324,7 @@ tf_xla_py_test( name = "self_adjoint_eig_op_test", size = "medium", srcs = ["self_adjoint_eig_op_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -694,7 +688,6 @@ tf_xla_py_test( name = "fft_test", size = "medium", srcs = ["fft_test.py"], - enable_mlir_bridge = True, python_version = "PY3", shard_count = 6, tags = [ @@ -865,6 +858,7 @@ tf_xla_py_test( size = "medium", timeout = "long", srcs = ["matrix_diag_ops_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -934,9 +928,8 @@ tf_xla_py_test( name = "pooling_ops_test", size = "medium", srcs = ["pooling_ops_test.py"], - enable_mlir_bridge = True, python_version = "PY3", - shard_count = 10, + shard_count = 20, tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip ], @@ -956,7 +949,7 @@ tf_xla_py_test( srcs = ["pooling_ops_3d_test.py"], enable_mlir_bridge = True, python_version = "PY3", - shard_count = 10, + shard_count = 20, tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip ], @@ -1193,6 +1186,10 @@ tf_xla_py_test( python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + "noasan", + "nomsan", + "notsan", + "optonly", ], deps = [ ":xla_test", @@ -1208,6 +1205,7 @@ tf_xla_py_test( name = "spacetobatch_op_test", size = "medium", srcs = ["spacetobatch_op_test.py"], + enable_mlir_bridge = True, python_version = "PY3", shard_count = 3, tags = [ @@ -1244,7 +1242,6 @@ tf_xla_py_test( name = "stack_ops_test", size = "small", srcs = ["stack_ops_test.py"], - enable_mlir_bridge = True, python_version = "PY3", tags = [ "config-cuda-only", @@ -1305,7 +1302,6 @@ tf_xla_py_test( srcs = ["tensor_array_ops_test.py"], # TensorArray ops are not implemented in the on-demand compilation model yet. disabled_backends = ["cpu_ondemand"], - enable_mlir_bridge = True, python_version = "PY3", tags = [ "config-cuda-only", @@ -1334,10 +1330,10 @@ tf_xla_py_test( srcs = ["tensor_list_ops_test.py"], # TensorList ops are not implemented in the on-demand compilation model yet. 
disabled_backends = ["cpu_ondemand"], - enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + "notap", # b/162025277 ], deps = [ ":xla_test", @@ -1889,7 +1885,6 @@ tf_xla_py_test( name = "special_math_test", size = "medium", srcs = ["special_math_test.py"], - enable_mlir_bridge = True, shard_count = 5, tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip diff --git a/tensorflow/compiler/tests/build_defs.bzl b/tensorflow/compiler/tests/build_defs.bzl index 19a1d62cddd..9c941e791ee 100644 --- a/tensorflow/compiler/tests/build_defs.bzl +++ b/tensorflow/compiler/tests/build_defs.bzl @@ -8,6 +8,7 @@ load( "tf_cuda_tests_tags", "tf_exec_properties", ) +load("//tensorflow:tensorflow.bzl", "py_test") def all_backends(): b = ["cpu"] + plugins.keys() @@ -121,7 +122,7 @@ def tf_xla_py_test( updated_name = updated_name[:-5] updated_name += "_mlir_bridge_test" - native.py_test( + py_test( name = updated_name, srcs = srcs, srcs_version = "PY2AND3", diff --git a/tensorflow/compiler/tests/case_test.py b/tensorflow/compiler/tests/case_test.py index 3b2dff537da..4da9c4fac7a 100644 --- a/tensorflow/compiler/tests/case_test.py +++ b/tensorflow/compiler/tests/case_test.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Tests for while loops in XLA.""" +"""Tests for case statements in XLA.""" from __future__ import absolute_import from __future__ import division diff --git a/tensorflow/compiler/tests/eager_test.py b/tensorflow/compiler/tests/eager_test.py index 520348e0f8a..eef9d24766d 100644 --- a/tensorflow/compiler/tests/eager_test.py +++ b/tensorflow/compiler/tests/eager_test.py @@ -311,7 +311,7 @@ class EagerFunctionTest(xla_test.XLATestCase): if 'GPU' in self.device: # TODO(b/32333178) self.skipTest('Current implementation of RandomStandardNormal kernel ' - 'is very slow on GPU, and has been blacklisted.') + 'is very slow on GPU, and has been denylisted.') with self.test_scope(): data_format = 'channels_last' conv = convolutional.Conv2D( diff --git a/tensorflow/compiler/tests/image_ops_test.py b/tensorflow/compiler/tests/image_ops_test.py index 326c3ec4929..9590688fda7 100644 --- a/tensorflow/compiler/tests/image_ops_test.py +++ b/tensorflow/compiler/tests/image_ops_test.py @@ -30,7 +30,6 @@ from six.moves import xrange # pylint: disable=redefined-builtin from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops -from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_image_ops from tensorflow.python.ops import image_ops @@ -775,7 +774,6 @@ class ResizeBilinearNonAlignCornersTest(xla_test.XLATestCase): class NonMaxSuppressionTest(xla_test.XLATestCase): - @test_util.disable_mlir_bridge("%1") def testNMS128From1024(self): num_boxes = 1024 boxes_np = np.random.normal(50, 10, (num_boxes, 4)).astype("f4") @@ -810,7 +808,6 @@ class NonMaxSuppressionTest(xla_test.XLATestCase): self.assertEqual(indices_tf.size, max_output_size) - @test_util.disable_mlir_bridge("%1") def testNMS3From6Boxes(self): # Three boxes are selected based on IOU. 
boxes_data = [[0, 0, 1, 1], [0, 0.1, 1, 1.1], [0, -0.1, 1, 0.9], @@ -852,7 +849,6 @@ class NonMaxSuppressionTest(xla_test.XLATestCase): self.assertEqual(num_valid, 3) self.assertAllClose(indices_tf[:num_valid], [3, 0, 5]) - @test_util.disable_mlir_bridge("%1") def testNMS3Then2WithScoreThresh(self): # Three boxes are selected based on IOU. # One is filtered out by score threshold. @@ -895,7 +891,6 @@ class NonMaxSuppressionTest(xla_test.XLATestCase): self.assertEqual(num_valid, 2) self.assertAllClose(indices_tf[:num_valid], [3, 0]) - @test_util.disable_mlir_bridge("%1") def testNMS3Then1WithScoreMaxThresh(self): # Three boxes are selected based on IOU. # One is filtered out by score threshold. @@ -939,7 +934,6 @@ class NonMaxSuppressionTest(xla_test.XLATestCase): self.assertEqual(num_valid, 1) self.assertAllClose(indices_tf[:num_valid], [3]) - @test_util.disable_mlir_bridge("%1") def testSelectFromContinuousOverLap(self): # Tests that a suppressed box does not itself suppress other boxes. @@ -984,7 +978,6 @@ class NonMaxSuppressionTest(xla_test.XLATestCase): class BatchedNonMaxSuppressionCorrectnessTest(xla_test.XLATestCase): - @test_util.disable_mlir_bridge("%1") def testBatchedNMSFrom6(self): boxes_data = [[[0, 0, 1, 1], [3, 3, 4, 4], [0, 0.4, 1, 1.4], [0, 0.6, 1, 1.6], [0, 0.8, 1, 1.8], [0, 2, 1, 2]], @@ -1022,7 +1015,6 @@ class BatchedNonMaxSuppressionCorrectnessTest(xla_test.XLATestCase): indices_output) self.assertAllEqual([5, 4], num_valid_output) - @test_util.disable_mlir_bridge("%1") def testBatchedNMSFrom6Max3(self): boxes_data = [[[0, 0, 1, 1], [3, 3, 4, 4], [0, 0.4, 1, 1.4], [0, 0.6, 1, 1.6], [0, 0.8, 1, 1.8], [0, 2, 1, 2]], @@ -1056,7 +1048,6 @@ class BatchedNonMaxSuppressionCorrectnessTest(xla_test.XLATestCase): self.assertAllEqual([[0, 1, 2], [0, 1, 3]], indices_output) self.assertAllEqual([3, 3], num_valid_output) - @test_util.disable_mlir_bridge("%1") def testBatchedNMSSingleFrom6Max3(self): boxes_data = [[0, 0, 1, 1], [3, 3, 4, 4], [0, 0.4, 1, 1.4], [0, 0.6, 1, 1.6], [0, 0.8, 1, 1.8], [0, 2, 1, 2]] @@ -1087,7 +1078,6 @@ class BatchedNonMaxSuppressionCorrectnessTest(xla_test.XLATestCase): self.assertAllEqual([0, 1, 2], indices_output) self.assertAllEqual(3, num_valid_output) - @test_util.disable_mlir_bridge("%1") def testBatchedNMSSingleFrom6NoPad(self): boxes_data = [[0, 0, 1, 1], [3, 3, 4, 4], [0, 0.4, 1, 1.4], [0, 0.6, 1, 1.6], [0, 0.8, 1, 1.8], [0, 2, 1, 2]] @@ -1117,7 +1107,6 @@ class BatchedNonMaxSuppressionCorrectnessTest(xla_test.XLATestCase): self.assertAllEqual([0, 1, 2, 4, 5], indices_output) self.assertAllEqual(5, num_valid_output) - @test_util.disable_mlir_bridge("%1") def testBatchedNMSBatchDimsFrom6Max3(self): boxes_data = [[[[0, 0, 1, 1], [3, 3, 4, 4], [0, 0.4, 1, 1.4], [0, 0.6, 1, 1.6], [0, 0.8, 1, 1.8], [0, 2, 1, 2]], @@ -1151,7 +1140,6 @@ class BatchedNonMaxSuppressionCorrectnessTest(xla_test.XLATestCase): self.assertAllEqual([[[0, 1, 2], [0, 1, 3]]], indices_output) self.assertAllEqual([[3, 3]], num_valid_output) - @test_util.disable_mlir_bridge("%1") def testBatchedNMSScoreThresholdFrom6Max3(self): boxes_data = [[[0, 0, 1, 1], [3, 3, 4, 4], [0, 0.4, 1, 1.4], [0, 0.6, 1, 1.6], [0, 0.8, 1, 1.8], [0, 2, 1, 2]], @@ -1187,7 +1175,6 @@ class BatchedNonMaxSuppressionCorrectnessTest(xla_test.XLATestCase): self.assertAllEqual([3, 2], num_valid_output) self.assertAllEqual([[0, 1, 2], [0, 1, invalid_index]], indices_output) - @test_util.disable_mlir_bridge("%1") def testBatchedNMSUnsortedInputFrom6(self): boxes_data = [[[0, 2, 1, 2], [3, 3, 4, 4], [0, 0, 1, 1], 
[0, 0.4, 1, 1.4], [0, 0.6, 1, 1.6], [0, 0.8, 1, 1.8]], @@ -1224,7 +1211,6 @@ class BatchedNonMaxSuppressionCorrectnessTest(xla_test.XLATestCase): indices_output) self.assertAllEqual([5, 4], num_valid_output) - @test_util.disable_mlir_bridge("%1") def testBatchedNMSNoncanonicalizedInputFrom6(self): boxes_data = [[[1, 0, 0, 1], [4, 3, 3, 4], [1, 0.4, 0, 1.4], [1, 0.6, 0, 1.6], [1, 0.8, 0, 1.8], [1, 2, 0, 2]], @@ -1262,7 +1248,6 @@ class BatchedNonMaxSuppressionCorrectnessTest(xla_test.XLATestCase): indices_output) self.assertAllEqual([5, 4], num_valid_output) - @test_util.disable_mlir_bridge("%1") def testBatchedNMSScoreThresholdCanInputsFrom6Max3(self): boxes_data = [[[0, 0, 1, 1], [3, 3, 4, 4], [0, 0.4, 1, 1.4], [0, 0.6, 1, 1.6], [0, 0.8, 1, 1.8], [0, 2, 1, 2]], @@ -1298,7 +1283,6 @@ class BatchedNonMaxSuppressionCorrectnessTest(xla_test.XLATestCase): self.assertAllEqual([3, 2], num_valid_output) self.assertAllEqual([[0, 1, 2], [0, 1, invalid_index]], indices_output) - @test_util.disable_mlir_bridge("%1") def testBatchedNMSFrom6DynamicInput(self): boxes_data = [[[0, 0, 1, 1], [3, 3, 4, 4], [0, 0.4, 1, 1.4], [0, 0.6, 1, 1.6], [0, 0.8, 1, 1.8], [0, 2, 1, 2]], diff --git a/tensorflow/compiler/tests/pooling_ops_3d_test.py b/tensorflow/compiler/tests/pooling_ops_3d_test.py index a833daa39be..9eda74b55a9 100644 --- a/tensorflow/compiler/tests/pooling_ops_3d_test.py +++ b/tensorflow/compiler/tests/pooling_ops_3d_test.py @@ -75,9 +75,6 @@ class Pooling3DTest(xla_test.XLATestCase): actual = vals.flatten() self.assertAllClose(expected, actual) - @test_util.disable_mlir_bridge("TODO(b/159812644): AvgPool TF to HLO lowering" - " doesn't support all paddings and data " - "formats") def testAvgPool3dValidPadding(self): expected_output = [20.5, 21.5, 22.5] self._VerifyValues( @@ -88,9 +85,6 @@ class Pooling3DTest(xla_test.XLATestCase): padding="VALID", expected=expected_output) - @test_util.disable_mlir_bridge("TODO(b/159812644): AvgPool TF to HLO lowering" - " doesn't support all paddings and data " - "formats") def testAvgPool3dSamePadding(self): expected_output = [20.5, 21.5, 22.5, 26.5, 27.5, 28.5] self._VerifyValues( @@ -101,9 +95,6 @@ class Pooling3DTest(xla_test.XLATestCase): padding="SAME", expected=expected_output) - @test_util.disable_mlir_bridge("TODO(b/159812644): AvgPool TF to HLO lowering" - " doesn't support all paddings and data " - "formats") def testAvgPool3dSamePaddingDifferentStrides(self): expected_output = [1.5, 4.5, 7.5, 17.5, 20.5, 23.5, 33.5, 36.5, 39.5] self._VerifyValues( diff --git a/tensorflow/compiler/tests/pooling_ops_test.py b/tensorflow/compiler/tests/pooling_ops_test.py index 293e1010b08..d9393387c0d 100644 --- a/tensorflow/compiler/tests/pooling_ops_test.py +++ b/tensorflow/compiler/tests/pooling_ops_test.py @@ -268,9 +268,6 @@ class PoolingTest(xla_test.XLATestCase): expected=[1, 3, 9, 11]) # Average pooling - @test_util.disable_mlir_bridge("TODO(b/159812644): AvgPool TF to HLO lowering" - " doesn't support all paddings and data " - "formats") def testAvgPoolValidPadding(self): expected_output = [7, 8, 9] self._VerifyValues( @@ -281,9 +278,6 @@ class PoolingTest(xla_test.XLATestCase): padding="VALID", expected=expected_output) - @test_util.disable_mlir_bridge("TODO(b/159812644): AvgPool TF to HLO lowering" - " doesn't support all paddings and data " - "formats") def testAvgPoolSamePadding(self): expected_output = [7., 8., 9., 11.5, 12.5, 13.5] self._VerifyValues( diff --git a/tensorflow/compiler/tests/sort_ops_test.py b/tensorflow/compiler/tests/sort_ops_test.py index 
d50fdec7c63..838718aa1e3 100644 --- a/tensorflow/compiler/tests/sort_ops_test.py +++ b/tensorflow/compiler/tests/sort_ops_test.py @@ -129,42 +129,35 @@ class XlaSortOpTest(xla_test.XLATestCase): def testTopKZeros(self): """Tests that positive and negative zeros sort correctly.""" - # Only bfloat16 is implemented. - bfloat16 = dtypes.bfloat16.as_numpy_dtype - if bfloat16 not in self.numeric_types: - return - - with self.session() as sess: - p = array_ops.placeholder(dtypes.bfloat16) - with self.test_scope(): - topk = nn_ops.top_k(p, k=4) - results = sess.run( - topk, - {p: np.array([0., -0., 0., 3., -0., -4., 0., -0.], dtype=bfloat16)}) - self.assertAllEqual( - np.array([3., 0., 0., 0.], dtype=bfloat16), results[0]) - self.assertEqual(list([3, 0, 2, 6]), list(results[1])) + supported_types = set([dtypes.bfloat16.as_numpy_dtype, np.float32]) + for dtype in supported_types.intersection(self.numeric_types): + with self.session() as sess: + p = array_ops.placeholder(dtype) + with self.test_scope(): + topk = nn_ops.top_k(p, k=4) + results = sess.run( + topk, + {p: np.array([0., -0., 0., 3., -0., -4., 0., -0.], dtype=dtype)}) + self.assertAllEqual(np.array([3., 0., 0., 0.], dtype=dtype), results[0]) + self.assertEqual(list([3, 0, 2, 6]), list(results[1])) def testTopKInfinities(self): """Tests that positive and negative infinity sort correctly.""" - # Only bfloat16 is implemented. - bfloat16 = dtypes.bfloat16.as_numpy_dtype - if bfloat16 not in self.numeric_types: - return - - with self.session() as sess: - p = array_ops.placeholder(dtypes.bfloat16) - with self.test_scope(): - topk = nn_ops.top_k(p, k=6) - results = sess.run(topk, { - p: np.array( - [1, 2, float("inf"), -float("inf"), -1, -2], dtype=bfloat16) - }) - self.assertAllEqual( - np.array( - [float("inf"), 2.0, 1.0, -1.0, -2.0, -float("inf")], - dtype=bfloat16), results[0]) - self.assertEqual(list([2, 1, 0, 4, 5, 3]), list(results[1])) + supported_types = set([dtypes.bfloat16.as_numpy_dtype, np.float32]) + for dtype in supported_types.intersection(self.numeric_types): + with self.session() as sess: + p = array_ops.placeholder(dtype) + with self.test_scope(): + topk = nn_ops.top_k(p, k=6) + results = sess.run(topk, { + p: + np.array([1, 2, float("inf"), -float("inf"), -1, -2], + dtype=dtype) + }) + self.assertAllEqual( + np.array([float("inf"), 2.0, 1.0, -1.0, -2.0, -float("inf")], + dtype=dtype), results[0]) + self.assertEqual(list([2, 1, 0, 4, 5, 3]), list(results[1])) def testInTopK(self): supported_types = set([np.int32, np.int64]) diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py index 162693a9eb1..eb022da6895 100644 --- a/tensorflow/compiler/tests/unary_ops_test.py +++ b/tensorflow/compiler/tests/unary_ops_test.py @@ -21,6 +21,7 @@ from __future__ import print_function import unittest import numpy as np +import six from six.moves import xrange # pylint: disable=redefined-builtin from tensorflow.compiler.tests import xla_test @@ -90,6 +91,10 @@ class UnaryOpsTest(xla_test.XLATestCase): self.assertAllClose(result, expected, rtol, atol) self.assertAllEqual(np.sort(result), result) + def AssertAllEqual(self, result, expected, rtol, atol): + """Tests that result and expeted are exactly equal.""" + self.assertAllEqual(result, expected) + @test_util.disable_mlir_bridge( "MlirHloBuilder::Iota missing required for xla::Diag") def testAllTypeOps(self): @@ -435,8 +440,12 @@ class UnaryOpsTest(xla_test.XLATestCase): self._assertOpOutputMatchesExpected( math_ops.sign, - np.array([[-2.0, -1.0, 
-0.0, +0.0, 1.0, 2.0]], dtype=dtype), - expected=np.array([[-1.0, -1.0, -0.0, +0.0, 1.0, 1.0]], dtype=dtype)) + np.array([[-2.0, -1.0, -0.0, +0.0, 1.0, 2.0, + float("nan")]], + dtype=dtype), + expected=np.array([[-1.0, -1.0, -0.0, +0.0, 1.0, 1.0, + float("nan")]], + dtype=dtype)) self._assertOpOutputMatchesExpected( math_ops.is_finite, @@ -775,6 +784,10 @@ class UnaryOpsTest(xla_test.XLATestCase): np.array([1 + 3j, -4 + 7j, 2.7, -3j], dtype=dtype), expected=np.array([1, -4, 2.7, 0], dtype=ctypes[dtype])) + @test_util.disable_mlir_bridge( + "TF_PopulationCount is missing and is required to translate to " + "xla::PopulationCount." + ) def testIntOps(self): for dtype in self.int_types: self._assertOpOutputMatchesExpected( @@ -782,6 +795,38 @@ class UnaryOpsTest(xla_test.XLATestCase): np.array([0, -1, 1, 16, 42], dtype=dtype), expected=np.array([-1, 0, -2, -17, -43], dtype=dtype)) + # Test population_count for array inputs. + raw_inputs = [ + 0, 1, -1, 3, -3, 5, -5, 14, -14, 127, 128, 255, 256, 65535, 65536, + 2**31 - 1, 2**31, 2**32 - 1, 2**32, -2**32 + 1, -2**32, -2**63 + 1, + 2**63 - 1 + ] + # Only choose inputs which fit in the int dtype. + raw_inputs = list( + filter(lambda x: np.iinfo(dtype).min <= x <= np.iinfo(dtype).max, + raw_inputs)) + inputs = np.array(raw_inputs, dtype=dtype) + + def count_bits(x): + return sum(bin(z).count("1") for z in six.iterbytes(x.tobytes())) + + truth = [count_bits(x) for x in inputs] + self._assertOpOutputMatchesExpected( + bitwise_ops.population_count, + inputs, + expected=np.array(truth, dtype=np.uint8), + equality_test=self.AssertAllEqual) + + # Test population_count for scalar inputs. + for raw_inp in raw_inputs: + inp = dtype(raw_inp) + truth = count_bits(inp) + self._assertOpOutputMatchesExpected( + bitwise_ops.population_count, + inp, + expected=np.uint8(truth), + equality_test=self.AssertAllEqual) + def testNumericOps(self): for dtype in self.numeric_types - {np.int8, np.uint8}: self._assertOpOutputMatchesExpected( @@ -923,16 +968,22 @@ class UnaryOpsTest(xla_test.XLATestCase): expected=np.array([1, 0x100000003f800000], np.uint64)) def testInvertPermutation(self): - self._assertOpOutputMatchesExpected( - array_ops.invert_permutation, - np.array([1, 2, 0], np.int32), - expected=np.array([2, 0, 1], dtype=np.int32)) + for np_dtype in [np.int32, np.int64]: + self._assertOpOutputMatchesExpected( + array_ops.invert_permutation, + np.array([1, 2, 0], np_dtype), + expected=np.array([2, 0, 1], dtype=np_dtype)) def testInvertPermutationTwiceIsNoop(self): - self._assertOpOutputMatchesExpected( - lambda x: array_ops.invert_permutation(array_ops.invert_permutation(x)), - np.array([1, 2, 0], np.int32), - expected=np.array([1, 2, 0], dtype=np.int32)) + + def invert_twice(x): + return array_ops.invert_permutation(array_ops.invert_permutation(x)) + + for np_dtype in [np.int32, np.int64]: + self._assertOpOutputMatchesExpected( + invert_twice, + np.array([1, 2, 0], np_dtype), + expected=np.array([1, 2, 0], dtype=np_dtype)) def testRank(self): rank_op = lambda x: array_ops.rank_internal(x, optimize=False) diff --git a/tensorflow/compiler/tests/xla_test.py b/tensorflow/compiler/tests/xla_test.py index f5f63cb60aa..3b057ed8b17 100644 --- a/tensorflow/compiler/tests/xla_test.py +++ b/tensorflow/compiler/tests/xla_test.py @@ -236,9 +236,7 @@ class XLATestCase(test.TestCase): @contextlib.contextmanager def test_scope(self): - """Test scope that runs tests on a Tensorflow/XLA device. - - Uses a compilation_scope() to mark operators to compile. 
+ """Test scope that runs tests on `self.device`. Yields: A scope to apply to the operators under test. diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index c9210a1a1e7..c4fc3e4f5da 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -45,6 +45,7 @@ limitations under the License. #include "tensorflow/core/grappler/optimizers/meta_optimizer.h" #include "tensorflow/core/grappler/utils.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/protobuf/config.pb.h" // NOLINT @@ -332,7 +333,6 @@ void UpdateToEngineNode(const std::vector& infos, Status CreateTRTNode(const ConversionParams& params, const std::vector& infos, int pos, int max_batch_size, Graph* graph, - nvinfer1::IGpuAllocator* alloc, std::vector* engine_nodes) { const auto& info = infos.at(pos); std::vector input_shape_protos; @@ -428,16 +428,30 @@ Status CreateTRTNode(const ConversionParams& params, // Build the engine and get its serialized representation. string segment_string; if (info.engine_type == EngineInfo::EngineType::TRTStatic) { + std::pair device_allocator = + GetDeviceAndAllocator(params, info); + int cuda_device_id = 0; + std::unique_ptr trt_allocator; + if (device_allocator.first >= 0) { + cuda_device_id = device_allocator.first; + trt_allocator.reset(new TRTDeviceAllocator(device_allocator.second)); + } else { + // The value in trt_allocator is a nullptr and cudamalloc will be used. + LOG_WARNING_WITH_PREFIX << "Can't identify the cuda device. Running on " + "device 0 and use cudamalloc as an allocator"; + } + cudaSetDevice(cuda_device_id); + auto trt_logger = GetLoggerRegistry()->LookUp(params.trt_logger_name); - // Create static engine for fp32/fp16 mode. + // Create static engines with precision_mode fp32/fp16. TrtUniquePtrType engine; - // TODO(sami): What happens if 1st dim is not batch? TF_RETURN_IF_ERROR(ConvertGraphDefToEngine( info.segment_graph_def, calibrate_int8 ? TrtPrecisionMode::FP32 : info.precision_mode, max_batch_size, info.max_workspace_size_bytes, input_shapes, trt_logger, - alloc, /*calibrator=*/nullptr, &engine, info.use_calibration, - params.use_implicit_batch, /*convert_successfully=*/nullptr, + trt_allocator.get(), /*calibrator=*/nullptr, &engine, + info.use_calibration, params.use_implicit_batch, + /*convert_successfully=*/nullptr, /*profile=*/nullptr)); TrtUniquePtrType engine_data(engine->serialize()); segment_string = string(static_cast(engine_data->data()), @@ -793,13 +807,27 @@ Status ConvertAfterShapes(const ConversionParams& params) { } } - // Create a TRT node for each segment using its EngineInfo. - int old_cuda_device = 0; - auto err = cudaGetDevice(&old_cuda_device); - if (err != cudaSuccess) { - LOG(ERROR) << "Couldn't get current device: " << cudaGetErrorString(err); + // Save the cuda device if we may need to switch to another cuda device to + // build static engines. 
+ absl::optional old_cuda_device = absl::nullopt; + if (!params.is_dyn_op) { + int cuda_device_id; + cudaError_t cuda_error = cudaGetDevice(&cuda_device_id); + if (cuda_error != cudaSuccess) { + LOG_WARNING_WITH_PREFIX << "Couldn't get current device: " + << cudaGetErrorString(cuda_error); + } else { + VLOG(1) << "Current cuda device is " << cuda_device_id; + old_cuda_device = cuda_device_id; + } } - VLOG(1) << "Current cuda device is " << old_cuda_device; + + auto restore_cuda_device = gtl::MakeCleanup([old_cuda_device] { + if (old_cuda_device.has_value()) { + cudaSetDevice(old_cuda_device.value()); + } + }); + std::vector engine_nodes; engine_nodes.resize(engine_segments.size()); for (int i = 0; i < engine_segments.size(); ++i) { @@ -813,24 +841,8 @@ Status ConvertAfterShapes(const ConversionParams& params) { 2.0; VLOG(1) << "Assigned " << engine.max_workspace_size_bytes << " bytes to " << engine.engine_name; - // The allocator is used to build the engine. The build and the built engine - // will be destroyed after we get the serialized engine string, so it's fine - // to use unique_ptr here. - std::unique_ptr alloc; - auto device_alloc = GetDeviceAndAllocator(params, engine); - int cuda_device_id = 0; - if (device_alloc.first >= 0) { - cuda_device_id = device_alloc.first; - alloc.reset(new TRTDeviceAllocator(device_alloc.second)); - } else { - // Setting allocator as nullptr should get revert to the cudamalloc - LOG_WARNING_WITH_PREFIX - << "Can't identify the cuda device. Running on device 0 "; - } - cudaSetDevice(cuda_device_id); - auto status = - CreateTRTNode(params, engine_segments, i, params.max_batch_size, &graph, - alloc.get(), &engine_nodes); + auto status = CreateTRTNode(params, engine_segments, i, + params.max_batch_size, &graph, &engine_nodes); string msg = StrCat("segment ", i, " consisting of ", converted_segments.at(i).size(), " nodes by ", @@ -859,7 +871,6 @@ Status ConvertAfterShapes(const ConversionParams& params) { } } } - cudaSetDevice(old_cuda_device); graph.ToGraphDef(params.output_graph_def); VLOG(1) << "Returning from conversion"; return Status::OK(); diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc index 54fb1d56441..3b0553426c0 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc @@ -71,7 +71,7 @@ class FakeCluster : public grappler::Cluster { } private: - const DeviceSet* device_set_; + const DeviceSet* device_set_ = nullptr; }; TEST(ConvertGraphTest, GetDeviceAndAllocator) { diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index 369b339d01a..f80c0f42eca 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -2410,6 +2410,40 @@ Status ConvertTranspose(OpConverterParams* params) { return Status::OK(); } +Status ConvertShape(OpConverterParams* params) { + const auto& inputs = params->inputs; + TF_RETURN_IF_ERROR( + CheckInputsWeights(*params, {{"input", TrtInputArg::kBoth}})); + if (params->use_implicit_batch) { + return errors::Unimplemented( + "Shape is only supported for explicit batch mode."); + } + if (HasStaticShape(inputs.at(0).GetTrtDims())) { + if (params->validation_only) return Status::OK(); + nvinfer1::Dims input_dims = inputs.at(0).GetTrtDims(); + nvinfer1::Dims output_dims{1, {input_dims.nbDims}}; + // Create a 
const node with the values of output_dims + TRT_ShapedWeights weight = params->weight_store->GetTempWeights( + nvinfer1::DataType::kINT32, output_dims); + int32* values_ptr = static_cast(weight.GetValues()); + std::copy(input_dims.d, input_dims.d + input_dims.nbDims, values_ptr); + auto output = params->converter->CreateConstantLayer(weight, output_dims); + params->outputs->push_back(TRT_TensorOrWeights(output)); + return Status::OK(); + } +#if IS_TRT_VERSION_GE(6, 0, 0, 0) + if (params->validation_only) return Status::OK(); + nvinfer1::IShapeLayer* shape_layer = + params->converter->network()->addShape(*inputs.at(0).tensor()); + TFTRT_RETURN_ERROR_IF_NULLPTR(shape_layer, params->node_def.name()); + params->outputs->push_back(TRT_TensorOrWeights(shape_layer->getOutput(0))); + return Status::OK(); +#else + return errors::Unavailable( + "Shape op conversion requires TensorRT 6 or above"); +#endif +} + Status ConvertReshape(OpConverterParams* params) { const auto& inputs = params->inputs; TF_RETURN_IF_ERROR( @@ -3510,8 +3544,13 @@ Status ConvertPool(OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}})); - TF_RETURN_IF_ERROR( - AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); +#if IS_TRT_VERSION_GE(5, 1, 0, 0) + std::set allowed_types{DataType::DT_FLOAT, DataType::DT_HALF, + DataType::DT_INT8}; +#else + std::set allowed_types{DataType::DT_FLOAT, DataType::DT_HALF}; +#endif + TF_RETURN_IF_ERROR(AllowDataTypes(*params, allowed_types)); nvinfer1::PoolingType type; if (node_def.op() == "MaxPool") { type = nvinfer1::PoolingType::kMAX; @@ -3744,6 +3783,7 @@ Status ConvertActivation(OpConverterParams* params) { params->converter->network()->addActivation(*inputs.at(0).tensor(), op_pair->second); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + layer->setName(node_def.name().c_str()); // Set parameters. #if IS_TRT_VERSION_GE(5, 1, 2, 0) if (node_def.op() == "Elu") { @@ -3844,9 +3884,10 @@ Status ConvertRelu6(OpConverterParams* params) { nvinfer1::IActivationLayer* layer = params->converter->network()->addActivation( *inputs.at(0).tensor(), nvinfer1::ActivationType::kCLIP); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); layer->setAlpha(0.0f); layer->setBeta(6.0f); - TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + layer->setName(node_def.name().c_str()); nvinfer1::ITensor* output_tensor = layer->getOutput(0); params->converter->ProvideQuantizationRange(output_tensor, 0.0f, 6.0f); params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); @@ -4402,6 +4443,7 @@ Status ConvertUnary(OpConverterParams* params) { nvinfer1::IUnaryLayer* layer = params->converter->network()->addUnary(*tensor, op_pair->second); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + layer->setName(node_def.name().c_str()); nvinfer1::ITensor* output_tensor = layer->getOutput(0); // Set quantization ranges. 
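  // Note on the setName(...) calls added in this change: tagging each TensorRT
  // layer with the originating TF node name makes TensorRT logs and profiling
  // output easier to trace back to the TensorFlow graph.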
@@ -4479,7 +4521,7 @@ Status ConvertReduce(OpConverterParams* params) { int trt_axis; TF_RETURN_IF_ERROR( ConvertAxis(tf_axes_list[i], tensor->getDimensions().nbDims, - node_def.name(), /*use_implicit_batch=*/true, &trt_axis)); + node_def.name(), params->use_implicit_batch, &trt_axis)); axes |= (1 << trt_axis); } @@ -4941,7 +4983,18 @@ Status ConvertFusedBatchNorm(OpConverterParams* params) { node_def.name()); } nvinfer1::ITensor* tensor = inputs.at(0).tensor(); - + if (!params->use_implicit_batch && tensor->getDimensions().d[1] == -1) { + // This check is to make sure that channel dimension is known during + // conversion. + // + // We check this only in explicit batch mode and reject an op with unknown + // channel dimension during segmentation. In implicit batch mode we have + // known shapes during conversion even though the shapes may not be known + // during segmentation (see the actual argument for input_shapes when + // ConvertGraphDefToEngine is called from TRTEngineOp::BuildEngine). + return errors::InvalidArgument("Channel dimension must be static, at ", + node_def.name()); + } // Check parameter types auto parameter_type = inputs.at(1).weights().TrtDType(); if ((parameter_type != nvinfer1::DataType::kFLOAT) && @@ -5039,6 +5092,7 @@ Status ConvertFusedBatchNorm(OpConverterParams* params) { combined_scale_weights.GetTrtWeights(), dummy_power_weights.GetTrtWeights()); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + layer->setName(node_def.name().c_str()); nvinfer1::ITensor* output_tensor = layer->getOutput(0); params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); @@ -5958,6 +6012,7 @@ static void RegisterValidatableOpConverters( (*registration)[pool_op_type] = ConvertPool3D; } #endif + (*registration)["Shape"] = ConvertShape; (*registration)["Rsqrt"] = ConvertRsqrt; (*registration)["Slice"] = ConvertSlice; (*registration)["Softmax"] = ConvertSoftmax; diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc index 52d05ff8225..aeae44a5562 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc @@ -1309,7 +1309,8 @@ std::vector GetDataAsFloat(InputOutputData& data) { class OpConverterTest : public ::testing::Test { public: OpConverterTest() - : scope_(Scope::NewRootScope()), allocator_(new GpuManagedAllocator()) { + : tensor_buffer_allocator_(new GpuManagedAllocator()), + scope_(Scope::NewRootScope()) { QCHECK_EQ(0, cudaStreamCreate(&stream_)); Reset(); } @@ -1341,7 +1342,7 @@ class OpConverterTest : public ::testing::Test { // Constructs a flat tensor with 'vals' in Unified Memory. 
template Tensor AsTensor(gtl::ArraySlice vals) { // non-absl ok - Tensor ret(allocator_.get(), DataTypeToEnum::value, + Tensor ret(tensor_buffer_allocator_.get(), DataTypeToEnum::value, {static_cast(vals.size())}); std::copy_n(vals.data(), vals.size(), ret.flat().data()); return ret; @@ -1351,7 +1352,7 @@ class OpConverterTest : public ::testing::Test { template Tensor AsTensor(gtl::ArraySlice vals, // non-absl ok const TensorShape& shape) { - Tensor ret(allocator_.get(), DataTypeToEnum::value, + Tensor ret(tensor_buffer_allocator_.get(), DataTypeToEnum::value, {static_cast(vals.size())}); CHECK(ret.CopyFrom(AsTensor(vals), shape)); return ret; @@ -1363,7 +1364,8 @@ class OpConverterTest : public ::testing::Test { template Tensor AsTensor(std::vector vals, const std::vector input_dims, DataType tf_type) { - Tensor ret(allocator_.get(), tf_type, {static_cast(vals.size())}); + Tensor ret(tensor_buffer_allocator_.get(), tf_type, + {static_cast(vals.size())}); if (tf_type == DT_FLOAT) { auto conv_vals = CastTestVector(vals); std::copy_n(conv_vals.data(), conv_vals.size(), ret.flat().data()); @@ -1646,13 +1648,15 @@ class OpConverterTest : public ::testing::Test { Logger logger_; TrtUniquePtrType engine_; cudaStream_t stream_; - // Used to create placeholders with shape and data type information. The - // created placeholders will be used as inputs to the node to be verified, - // thus we need the shape and data type information to get a non-empty - // GraphProperties. + std::unique_ptr tensor_buffer_allocator_; + // The scope that contains the graph being converted. Because + // tensor_buffer_allocator_ provides the storage for tensor contents that are + // represented as attributes for graph nodes within scope_, + // tensor_buffer_allocator_ needs to be available when destructing scope_. + // Therefore, scope_ comes after tensor_buffer_allocator_ in the class member + // field list. Scope scope_; std::unordered_map node_inputs_; - std::unique_ptr allocator_; }; // General test parameters to be used with ops that take a single input tensor. @@ -1781,7 +1785,8 @@ class ParameterizedOpConverterTestBase void BuildAndRun(const string& name, const std::vector>& expected_output_dims, const Status& expected_runtime_status, - const std::vector>>& matcher) { + const std::vector>>& matcher, + const std::vector& out_tf_types = {}) { TensorShape shape; const int n_output = expected_output_dims.size(); ASSERT_EQ(n_output, matcher.size()); @@ -1790,12 +1795,14 @@ class ParameterizedOpConverterTestBase TF_EXPECT_OK( TensorShapeUtils::MakeShape(expected_output_dims[i], &shape)); string out_name = (n_output == 1) ? name : StrCat(name, ":", i); - InputOutputData data{out_name, - ConstructTensor(shape.num_elements(), 0, tf_type)}; + DataType out_tf_type = + out_tf_types.size() > i ? out_tf_types[i] : tf_type; + InputOutputData data{ + out_name, ConstructTensor(shape.num_elements(), 0, out_tf_type)}; output_data.push_back(data); } - ASSERT_FALSE(input_data_.empty()); - const int batch_size = input_data_[0].tensor.shape().dim_size(0); + const int batch_size = + input_data_.empty() ? 
1 : input_data_[0].tensor.shape().dim_size(0); Status stat = OpConverterTest::BuildAndRun(input_data_, &output_data, batch_size); ASSERT_EQ(expected_runtime_status.ok(), stat.ok()) @@ -1820,13 +1827,15 @@ class ParameterizedOpConverterTestBase const std::vector& expected_output_dims, const Status& expected_conversion_status, const Status& expected_runtime_status, - const Matcher>& matcher) { + const Matcher>& matcher, + const std::vector& out_tf_types = {}) { RunValidationAndConversion(node_def, expected_conversion_status, name.c_str(), expected_output_dims); if (expected_conversion_status.ok()) { BuildAndRun(name, std::vector>({expected_output_dims}), expected_runtime_status, - std::vector>>({matcher})); + std::vector>>({matcher}), + out_tf_types); } } @@ -2011,6 +2020,142 @@ TEST_F(OpConverterTest, ConvertConst) { TestConvertConst(this); } +template +NodeDef CreateFusedBatchNormOp(DataType tf_type, std::string data_format, + bool is_training, float epsilon) { + Scope s = Scope::NewRootScope(); + auto x = ops::Placeholder(s.WithOpName("x"), tf_type); + auto scale = ops::Placeholder(s.WithOpName("scale"), tf_type); + auto offset = ops::Placeholder(s.WithOpName("offset"), tf_type); + auto mean = ops::Placeholder(s.WithOpName("mean"), tf_type); + auto variance = ops::Placeholder(s.WithOpName("variance"), tf_type); + typename T::Attrs attrs; + attrs.data_format_ = data_format; + attrs.is_training_ = is_training; + if (epsilon > 0) { + attrs.epsilon_ = epsilon; + } else { + EXPECT_GE(epsilon, 0); + } + return T(s.WithOpName("my_batchnorm"), x, scale, offset, mean, variance, + attrs) + .operation.node() + ->def(); +} + +TEST_P(OpConverterTest1, ConvertFusedBatchNorm) { + using OpFunc = std::function; + std::vector get_node_def_vec{ + CreateFusedBatchNormOp, + CreateFusedBatchNormOp, + CreateFusedBatchNormOp}; + + struct TestParam { + std::string data_format; + int tensor_input_idx; // Index of an input that will be provided as tensor. 
+ bool is_training; + float epsilon; + Status conversion_status; + bool keep_channel_unknown; + }; + + struct NodeInput { + std::string name; + std::vector dims; + std::vector val; + }; + std::vector node_input{ + {"x", {2, 3, 2, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}}, + {"scale", {3}, {7, 8, 9}}, + {"offset", {3}, {10, 20, 30}}, + {"mean", {3}, {1, 2, 3}}, + {"variance", {3}, {4, 5, 6}}}; + + std::vector expected_output{10.0, 13.495633, 23.574135, 27.148273, + 37.342354, 41.013527, 30.9738, 34.469433, + 45.018955, 48.59309, 59.369415, 63.04059}; + for (auto get_node_def : get_node_def_vec) { + NodeDef tmp_node_def = get_node_def(tf_type, "NCHW", true, 0); + std::string op_name = tmp_node_def.op(); + std::vector test_param{ + {"NHWC", 0, false, 0, + errors::Unimplemented(StrCat( + op_name, " only supports data_format=NCHW, at my_batchnorm"))}, + {"NCHW", 0, true, 0, + errors::Unimplemented(StrCat( + op_name, " only supports is_training=false, at my_batchnorm"))}, + {"NCHW", 1, false, 0, + errors::Unimplemented(StrCat("The input \"scale\" for ", op_name, + " must be a constant, at my_batchnorm"))}, + {"NCHW", 2, false, 0, + errors::Unimplemented(StrCat("The input \"offset\" for ", op_name, + " must be a constant, at my_batchnorm"))}, + {"NCHW", 3, false, 0, + errors::Unimplemented(StrCat("The input \"mean\" for ", op_name, + " must be a constant, at my_batchnorm"))}, + {"NCHW", 4, false, 0, + errors::Unimplemented(StrCat("The input \"variance\" for ", op_name, + " must be a constant, at my_batchnorm"))}, + {"NCHW", 0, false, 0.01}}; // The last one is the only test that runs. + if (trt_mode == TrtTestMode::kDynamicShape) { + test_param.push_back( + {"NCHW", 0, false, 0.01, + errors::InvalidArgument( + "Channel dimension must be static, at my_batchnorm"), + true}); + } + for (auto p : test_param) { + Reset(); + NodeDef node_def = + get_node_def(tf_type, p.data_format, p.is_training, p.epsilon); + for (int i = 0; i < node_input.size(); i++) { + if (i == 0 || i == p.tensor_input_idx) { + // The first input (x) is always added as a tensor, and it hase shape + // NCHW. The other inputs are per channel values (1D, size C). + // + // In implicit batch mode, it is not possible to add any of the 1D + // inputs as a tensor: the first dim is always treated as batch dim in + // implicit batch mode, and that has to agree for all tensors. We have + // two input tensors with shapes NCHW and C and in general N != C. + // The converter already picked up N from the fist input, and reports + // an error when we try to add any other tensors with not matching + // first dim. + // + // This restriction does not apply in explicit batch mode: the tensors + // can have different first dim. The converter still expects that only + // the first arg is a tensor. TODO(tfeher) Check if one can relax this + // restriction. + Status expected_status = + (i != 0 && trt_mode == TrtTestMode::kImplicitBatch) + ? 
errors::InvalidArgument( + StrCat("Batch size doesn't match for tensor ", + node_input[i].name, + ": Provided batch size does not match " + "converter batch size: 3 vs 2")) + : Status::OK(); + std::vector partial_input_shape; + if (i == 0 && trt_mode == TrtTestMode::kDynamicShape && + !p.keep_channel_unknown) { + // keep channel dim static (known) + partial_input_shape.resize(4, -1); + partial_input_shape[1] = node_input[i].dims[1]; + } + AddTestTensor(node_input[i].name, node_input[i].dims, tf_type, + node_input[i].val, partial_input_shape, + expected_status); + + } else { + AddTestWeights(node_input[i].name, node_input[i].dims, + node_input[i].val, tf_type); + } + } + TestOpConverter("my_batchnorm", node_def, node_input[0].dims, + p.conversion_status, Status::OK(), + ArrayFloatNear(expected_output)); + } + } +} // namespace convert + TEST_P(OpConverterTest1, ConvertTranspose) { // Get the NodeDef for Transpose. Scope s = Scope::NewRootScope(); @@ -2169,6 +2314,52 @@ TEST_F(OpConverterTest, ConvertReshape) { } } +TEST_P(OpConverterTest1, ConvertShape) { + // Get the NodeDef for Shape op. + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type); + auto shape = ops::Shape(s.WithOpName("my_shape"), input); + const NodeDef& node_def = shape.operation.node()->def(); + + Status conversion_status = + (trt_mode == TrtTestMode::kImplicitBatch) + ? errors::Unimplemented( + "Shape is only supported for explicit batch mode.") + : Status::OK(); + std::vector test_params = { + TestParamBase{{1, 2, 3}, {}, {3}, {}, conversion_status}, + // Add input as weight (we use non empty param ({1}) to trigger this). + TestParamBase{{1, 2, 3}, {}, {3}, {1}, conversion_status}, + }; + + auto input_is_weight = [](const TestParamBase p) { return !p.param.empty(); }; + for (auto p : test_params) { + SCOPED_TRACE(p); + Reset(); + // The number of elements of the input tensor. We leave it 0 in case we do + // not need to add an input tensor. This happens in explicit batch mode: the + // shape is known at conversion time and therefore the shape is added to the + // network as a constant layer. In this case the single node network that + // we use for the unit test have no actual input tensor when it is converted + // to a TensorRT network. + int n_elements = 0; + if (input_is_weight(p) || trt_mode != TrtTestMode::kExplicitBatch) { + // Calculate the number of elements for adding input data. + n_elements = std::accumulate(p.input_dims.begin(), p.input_dims.end(), 1, + std::multiplies()); + } + std::vector input_val(n_elements, 1); + if (!input_is_weight(p)) { + AddTestTensor("input", p.input_dims, input_val); + } else { + AddTestWeights("input", p.input_dims, input_val, tf_type); + } + TestOpConverter("my_shape", node_def, p.expected_output_dims, p.status, + p.runtime_status, ElementsAreArray(p.input_dims), + {DT_INT32}); + } +} + // Helper function for testing MatMul and BatchMatMul // get_matmul corresponds to the function used to generate the node. It should // accept (DataType, transpose_a, transpose_b) as parameters. @@ -4039,72 +4230,81 @@ TEST_P(OpConverterTest1, ConvertConv2D) { // Ok. 
std::vector ok_params = { - // Basic - TestParams{/*input_dims=*/{1, 1, 2, 3}, - /*input=*/{0, 1, 2, 3, 3, 4}, - /*filter_dims=*/{1, 2, 1, 1}, - /*filter=*/{-1, 1}, - /*strides=*/{1, 1, 1, 1}, - /*padding=*/"VALID", - /*data_format=*/"NCHW", - /*dilations=*/{1, 1, 1, 1}, - /*expected_output_dims=*/{1, 1, 2, 2}, - /*expected_output=*/{1, 1, 0, 1}}, - // SAME padding (Asymmetric) - TestParams{/*input_dims=*/{1, 1, 2, 3}, - /*input=*/{0, 1, 2, 3, 3, 4}, - /*filter_dims=*/{1, 2, 1, 1}, - /*filter=*/{-1, 1}, - /*strides=*/{1, 1, 1, 1}, - /*padding=*/"SAME", - /*data_format=*/"NCHW", - /*dilations=*/{1, 1, 1, 1}, - /*expected_output_dims=*/{1, 1, 2, 3}, - /*expected_output=*/{1, 1, -2, 0, 1, -4}}, - // SAME padding (Symmetric) - TestParams{/*input_dims=*/{1, 1, 2, 3}, - /*input=*/{0, 1, 2, 3, 3, 4}, - /*filter_dims=*/{1, 3, 1, 1}, - /*filter=*/{-1, 0, 1}, - /*strides=*/{1, 1, 1, 1}, - /*padding=*/"SAME", - /*data_format=*/"NCHW", - /*dilations=*/{1, 1, 1, 1}, - /*expected_output_dims=*/{1, 1, 2, 3}, - /*expected_output=*/{1, 2, -1, 3, 1, -3}}, - // NHWC - TestParams{/*input_dims=*/{1, 2, 3, 1}, - /*input=*/{0, 1, 2, 3, 3, 4}, - /*filter_dims=*/{1, 2, 1, 1}, - /*filter=*/{-1, 1}, - /*strides=*/{1, 1, 1, 1}, - /*padding=*/"VALID", - /*data_format=*/"NHWC", - /*dilations=*/{1, 1, 1, 1}, - /*expected_output_dims=*/{1, 2, 2, 1}, - /*expected_output=*/{1, 1, 0, 1}}, - // Dilated - TestParams{/*input_dims=*/{1, 1, 2, 3}, - /*input=*/{0, 1, 2, 3, 3, 4}, - /*filter_dims=*/{1, 2, 1, 1}, - /*filter=*/{-1, 1}, - /*strides=*/{1, 1, 1, 1}, - /*padding=*/"VALID", - /*data_format=*/"NCHW", - /*dilations=*/{1, 1, 1, 2}, - /*expected_output_dims=*/{1, 1, 2, 1}, - /*expected_output=*/{2, 1}}, - // Strided - TestParams{/*input_dims=*/{1, 1, 2, 4}, - /*input=*/{0, 1, 2, 2, 3, 4, 4, 7}, - /*filter_dims=*/{1, 2, 1, 1}, - /*filter=*/{-1, 1}, - /*strides=*/{1, 1, 1, 2}, - /*padding=*/"VALID", - /*data_format=*/"NCHW", - /*dilations=*/{1, 1, 1, 1}, - /*expected_output_dims=*/{1, 1, 2, 2}, - /*expected_output=*/{1, 0, 1, 3}}, +// TODO(b/162447069): Enable the test parameters for TRT 7.1.3.x. +#if !IS_TRT_VERSION_GE(7, 1, 3, 0) + // Basic + TestParams{/*input_dims=*/{1, 1, 2, 3}, + /*input=*/{0, 1, 2, 3, 3, 4}, + /*filter_dims=*/{1, 2, 1, 1}, + /*filter=*/{-1, 1}, + /*strides=*/{1, 1, 1, 1}, + /*padding=*/"VALID", + /*data_format=*/"NCHW", + /*dilations=*/{1, 1, 1, 1}, + /*expected_output_dims=*/{1, 1, 2, 2}, + /*expected_output=*/{1, 1, 0, 1}}, +#endif +// TODO(b/162448349): Enable the test parameters for TRT 7.1.3.x. +#if !IS_TRT_VERSION_GE(7, 1, 3, 0) + // SAME padding (Asymmetric) + TestParams{/*input_dims=*/{1, 1, 2, 3}, + /*input=*/{0, 1, 2, 3, 3, 4}, + /*filter_dims=*/{1, 2, 1, 1}, + /*filter=*/{-1, 1}, + /*strides=*/{1, 1, 1, 1}, + /*padding=*/"SAME", + /*data_format=*/"NCHW", + /*dilations=*/{1, 1, 1, 1}, + /*expected_output_dims=*/{1, 1, 2, 3}, + /*expected_output=*/{1, 1, -2, 0, 1, -4}}, + // SAME padding (Symmetric) + TestParams{/*input_dims=*/{1, 1, 2, 3}, + /*input=*/{0, 1, 2, 3, 3, 4}, + /*filter_dims=*/{1, 3, 1, 1}, + /*filter=*/{-1, 0, 1}, + /*strides=*/{1, 1, 1, 1}, + /*padding=*/"SAME", + /*data_format=*/"NCHW", + /*dilations=*/{1, 1, 1, 1}, + /*expected_output_dims=*/{1, 1, 2, 3}, + /*expected_output=*/{1, 2, -1, 3, 1, -3}}, +#endif +// TODO(b/162447069): Enable the test parameters for TRT 7.1.3.x. 
+#if !IS_TRT_VERSION_GE(7, 1, 3, 0) + // NHWC + TestParams{/*input_dims=*/{1, 2, 3, 1}, + /*input=*/{0, 1, 2, 3, 3, 4}, + /*filter_dims=*/{1, 2, 1, 1}, + /*filter=*/{-1, 1}, + /*strides=*/{1, 1, 1, 1}, + /*padding=*/"VALID", + /*data_format=*/"NHWC", + /*dilations=*/{1, 1, 1, 1}, + /*expected_output_dims=*/{1, 2, 2, 1}, + /*expected_output=*/{1, 1, 0, 1}}, + // Dilated + TestParams{/*input_dims=*/{1, 1, 2, 3}, + /*input=*/{0, 1, 2, 3, 3, 4}, + /*filter_dims=*/{1, 2, 1, 1}, + /*filter=*/{-1, 1}, + /*strides=*/{1, 1, 1, 1}, + /*padding=*/"VALID", + /*data_format=*/"NCHW", + /*dilations=*/{1, 1, 1, 2}, + /*expected_output_dims=*/{1, 1, 2, 1}, + /*expected_output=*/{2, 1}}, + // Strided + TestParams{/*input_dims=*/{1, 1, 2, 4}, + /*input=*/{0, 1, 2, 2, 3, 4, 4, 7}, + /*filter_dims=*/{1, 2, 1, 1}, + /*filter=*/{-1, 1}, + /*strides=*/{1, 1, 1, 2}, + /*padding=*/"VALID", + /*data_format=*/"NCHW", + /*dilations=*/{1, 1, 1, 1}, + /*expected_output_dims=*/{1, 1, 2, 2}, + /*expected_output=*/{1, 0, 1, 3}}, +#endif }; for (int i = 0; i < ok_params.size(); i++) { @@ -4589,41 +4789,72 @@ TEST_F(OpConverterTest, ConvertConv3D) { ElementsAreArray(ok_params[i].expected_output)); } } +#endif -TEST_F(OpConverterTest, ConvertPool3D) { - // Get nodedef for MaxPool3D and AvgPool3D layers. - auto get_pool3d_nodedef = [](std::vector ksize = {1, 1, 1, 1, 1}, - std::vector strides = {1, 1, 1, 1, 1}, - string padding = "SAME", - string data_format = "NCDHW", - const bool is_max_pooling = true) -> NodeDef { - Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); - +template +NodeDef CreatePoolOp(DataType tf_type, std::vector ksize, + std::vector strides, string padding, + string data_format) { + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type); + typename T::Attrs attrs; + attrs.data_format_ = data_format; + return T(s.WithOpName("my_pool"), input, ksize, strides, padding, attrs) + .operation.node() + ->def(); +} +TEST_P(OpConverterTest1, ConvertPool) { + // Get nodedef for MaxPool and AvgPool layers (2D or 3D). + auto get_pool_nodedef = + [](DataType tf_type, int nDim, std::vector ksize = {}, + std::vector strides = {}, string padding = "SAME", + string data_format = "", const bool is_max_pooling = true) -> NodeDef { + if (ksize.empty()) { + ksize = nDim == 2 ? std::vector{1, 1, 1, 1} + : std::vector{1, 1, 1, 1, 1}; + } + if (strides.empty()) { + strides = nDim == 2 ? std::vector{1, 1, 1, 1} + : std::vector{1, 1, 1, 1, 1}; + } + if (data_format == "") { + data_format = nDim == 2 ? 
"NCHW" : "NCDHW"; + } if (is_max_pooling) { - ops::MaxPool3D::Attrs attrs = - ops::MaxPool3D::Attrs().DataFormat(data_format); - auto pool3d = ops::MaxPool3D(s.WithOpName("my_maxpool3d"), input, ksize, - strides, padding, attrs); - return pool3d.operation.node()->def(); + if (nDim == 3) { + return CreatePoolOp(tf_type, ksize, strides, padding, + data_format); + } else { + return CreatePoolOp(tf_type, ksize, strides, padding, + data_format); + } } else { - ops::AvgPool3D::Attrs attrs = - ops::AvgPool3D::Attrs().DataFormat(data_format); - auto pool3d = ops::AvgPool3D(s.WithOpName("my_avgpool3d"), input, ksize, - strides, padding, attrs); - return pool3d.operation.node()->def(); + if (nDim == 3) { + return CreatePoolOp(tf_type, ksize, strides, padding, + data_format); + } else { + return CreatePoolOp(tf_type, ksize, strides, padding, + data_format); + } } }; - { +#if IS_TRT_VERSION_GE(6, 0, 0, 0) + std::vector test_nDims{2, 3}; +#else + std::vector test_nDims{2}; +#endif + + for (int nDim : test_nDims) { // Input is weights, should fail. Reset(); - NodeDef node_def = get_pool3d_nodedef(); + NodeDef node_def = get_pool_nodedef(tf_type, nDim); - AddTestWeights("input", {1, 2, 3}, {1, 2, 3, 4, 5, 6}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"input\" for MaxPool3D must be a tensor, at my_maxpool3d"); + AddTestWeights("input", {1, 1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); + RunValidationAndConversion(node_def, error::UNIMPLEMENTED, + StrCat("The input \"input\" for ", node_def.op(), + " must be a tensor, at my_pool") + .c_str()); } struct TestParams { @@ -4633,150 +4864,110 @@ TEST_F(OpConverterTest, ConvertPool3D) { std::vector strides; string padding; string data_format; - bool is_max_pooling; std::vector expected_output_dims; - std::vector expected_output; + // The expected outputs for the following operations: MaxPool2D, AvgPool2D, + // MaxPool3D, AvgPool3D + std::vector> expected_outputs; }; - // Start here - const std::vector common_array{-4, 2, 15, 3, 6, -3, 22, 1, 88, + // We use common_input as the input to test both 2D and 3D pooling operations, + // to simplify TestParams. For 2D operations, only the first 1/3 of the values + // are used. + const std::vector common_input{-4, 2, 15, 3, 6, -3, 22, 1, 88, 56, 36, 1, 1, 105, 1, 16, -28, 1, 42, 9, 3, 1, 7, 1, 11, 61, 5}; + // The output of 2D ops for the case where the op is equivalent to the + // identity op. 
+ const std::vector common_2d_output{-4, 2, 15, 3, 6, -3, 22, 1, 88}; std::vector ok_params = { // Basic - just 1x1 max pooling - input = output - TestParams{/*input_dims=*/{1, 3, 3, 3}, - /*input=*/common_array, - /*ksize=*/{1, 1, 1, 1, 1}, - /*strides=*/{1, 1, 1, 1, 1}, - /*padding=*/"VALID", - /*data_format=*/"NCDHW", - /*is_max_pooling=*/true, - /*expected_output_dims=*/{1, 3, 3, 3}, - /*expected_output=*/common_array}, - // Basic - just 1x1 avg pooling - input = output - TestParams{/*input_dims=*/{1, 3, 3, 3}, - /*input=*/common_array, - /*ksize=*/{1, 1, 1, 1, 1}, - /*strides=*/{1, 1, 1, 1, 1}, - /*padding=*/"VALID", - /*data_format=*/"NCDHW", - /*is_max_pooling=*/false, - /*expected_output_dims=*/{1, 3, 3, 3}, - /*expected_output=*/common_array}, + TestParams{ + /*input_dims=*/{1, 1, 3, 3, 3}, + /*input=*/common_input, + /*ksize=*/{1, 1, 1, 1, 1}, + /*strides=*/{1, 1, 1, 1, 1}, + /*padding=*/"VALID", + /*data_format=*/"NCDHW", + /*expected_output_dims=*/{1, 1, 3, 3, 3}, + /*expected_outputs=*/ + {common_2d_output, common_2d_output, common_input, common_input}}, // Basic - just 1x1 max pooling - input = output, SAME padding - TestParams{/*input_dims=*/{1, 3, 3, 3}, - /*input=*/common_array, - /*ksize=*/{1, 1, 1, 1, 1}, - /*strides=*/{1, 1, 1, 1, 1}, - /*padding=*/"SAME", - /*data_format=*/"NCDHW", - /*is_max_pooling=*/true, - /*expected_output_dims=*/{1, 3, 3, 3}, - /*expected_output=*/common_array}, - // Basic - just 1x1 avg pooling - input = output, SAME padding - TestParams{/*input_dims=*/{1, 3, 3, 3}, - /*input=*/common_array, - /*ksize=*/{1, 1, 1, 1, 1}, - /*strides=*/{1, 1, 1, 1, 1}, - /*padding=*/"VALID", - /*data_format=*/"NCDHW", - /*is_max_pooling=*/false, - /*expected_output_dims=*/{1, 3, 3, 3}, - /*expected_output=*/common_array}, - // 3x3 max pooling - TestParams{/*input_dims=*/{1, 3, 3, 3}, - /*input=*/common_array, + TestParams{ + /*input_dims=*/{1, 1, 3, 3, 3}, + /*input=*/common_input, + /*ksize=*/{1, 1, 1, 1, 1}, + /*strides=*/{1, 1, 1, 1, 1}, + /*padding=*/"SAME", + /*data_format=*/"NCDHW", + /*expected_output_dims=*/{1, 1, 3, 3, 3}, + /*expected_outputs=*/ + {common_2d_output, common_2d_output, common_input, common_input}}, + // 3x3 pooling NCDHW + TestParams{/*input_dims=*/{1, 1, 3, 3, 3}, + /*input=*/common_input, /*ksize=*/{1, 1, 3, 3, 3}, /*strides=*/{1, 1, 1, 1, 1}, /*padding=*/"VALID", /*data_format=*/"NCDHW", - /*is_max_pooling=*/true, - /*expected_output_dims=*/{1, 1, 1, 1}, - /*expected_output=*/{105}}, - // 3x3 avg pooling - TestParams{/*input_dims=*/{1, 3, 3, 3}, - /*input=*/common_array, - /*ksize=*/{1, 1, 3, 3, 3}, + /*expected_output_dims=*/{1, 1, 1, 1, 1}, + /*expected_outputs=*/{{88}, {14.444445}, {105}, {17}}}, + // 3x3 pooling, NDHWC + TestParams{/*input_dims=*/{1, 3, 3, 3, 1}, + /*input=*/common_input, + /*ksize=*/{1, 3, 3, 3, 1}, /*strides=*/{1, 1, 1, 1, 1}, /*padding=*/"VALID", + /*data_format=*/"NDHWC", + /*expected_output_dims=*/{1, 1, 1, 1, 1}, + /*expected_outputs=*/{{88}, {14.444445}, {105}, {17}}}, + // Strided + TestParams{/*input_dims=*/{1, 1, 3, 3, 3}, + /*input=*/{1, 0, 2, 0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 5, 0, 6, 0, 0, 0, 7, 0, 8}, + /*ksize=*/{1, 1, 1, 1, 1}, + /*strides=*/{1, 1, 2, 2, 2}, + /*padding=*/"VALID", /*data_format=*/"NCDHW", - /*is_max_pooling=*/false, - /*expected_output_dims=*/{1, 1, 1, 1}, - /*expected_output=*/{17}}, - // 3x3 max pooling, NDHWC - TestParams{/*input_dims=*/{3, 3, 3, 1}, - /*input=*/common_array, - /*ksize=*/{1, 3, 3, 3, 1}, - /*strides=*/{1, 1, 1, 1, 1}, - /*padding=*/"VALID", - 
/*data_format=*/"NDHWC", - /*is_max_pooling=*/true, - /*expected_output_dims=*/{1, 1, 1, 1}, - /*expected_output=*/{105}}, - // 3x3 avg pooling, NDHWC - TestParams{/*input_dims=*/{3, 3, 3, 1}, - /*input=*/common_array, - /*ksize=*/{1, 3, 3, 3, 1}, - /*strides=*/{1, 1, 1, 1, 1}, - /*padding=*/"VALID", - /*data_format=*/"NDHWC", - /*is_max_pooling=*/false, - /*expected_output_dims=*/{1, 1, 1, 1}, - /*expected_output=*/{17}}, - // Strided max - TestParams{ - /*input_dims=*/{1, 3, 3, 3}, - /*input=*/{1, 0, 2, 0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 5, 0, 6, 0, 0, 0, 7, 0, 8}, - /*ksize=*/{1, 1, 1, 1, 1}, - /*strides=*/{1, 1, 2, 2, 2}, - /*padding=*/"VALID", - /*data_format=*/"NCDHW", - /*is_max_pooling=*/true, - /*expected_output_dims=*/{1, 2, 2, 2}, - /*expected_output=*/{1, 2, 3, 4, 5, 6, 7, 8} // Should only pick up - // the corners - }, - // Strided avg - TestParams{ - /*input_dims=*/{1, 3, 3, 3}, - /*input=*/{1, 0, 2, 0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 5, 0, 6, 0, 0, 0, 7, 0, 8}, - /*ksize=*/{1, 1, 1, 1, 1}, - /*strides=*/{1, 1, 2, 2, 2}, - /*padding=*/"VALID", - /*data_format=*/"NCDHW", - /*is_max_pooling=*/false, - /*expected_output_dims=*/{1, 2, 2, 2}, - /*expected_output=*/{1, 2, 3, 4, 5, 6, 7, 8} // Should only pick up - // the corners - }}; + /*expected_output_dims=*/{1, 1, 2, 2, 2}, + /*expected_outputs=*/ + {{1, 2, 3, 4}, // Should only pick up the corners + {1, 2, 3, 4}, + {1, 2, 3, 4, 5, 6, 7, 8}, + {1, 2, 3, 4, 5, 6, 7, 8}}}, + }; - for (int i = 0; i < ok_params.size(); i++) { - Reset(); - NodeDef node_def = get_pool3d_nodedef( - ok_params[i].ksize, ok_params[i].strides, ok_params[i].padding, - ok_params[i].data_format, ok_params[i].is_max_pooling); - AddTestTensor("input", ok_params[i].input_dims); - RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - string expected_node_name = - ok_params[i].is_max_pooling ? "my_maxpool3d" : "my_avgpool3d"; - TF_EXPECT_OK(GetTensorOrWeights(expected_node_name, &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims, - output.tensor()->getDimensions()); - - const DataVec input_data{{"input", AsTensor(ok_params[i].input)}}; - DataVec output_data{ - {expected_node_name, - ConstructTensor(ok_params[i].expected_output.size())}}; - TF_EXPECT_OK(BuildAndRun(input_data, &output_data)); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(ok_params[i].expected_output)); + for (auto p : ok_params) { + int test_counter = 0; + for (int nDim : test_nDims) { + auto input = p.input; + auto input_dims = p.input_dims; + auto ksize = p.ksize; + auto strides = p.strides; + auto expected_output_dims = p.expected_output_dims; + std::string data_format = p.data_format; + if (nDim == 2) { + input.resize(9); + data_format = p.data_format == "NDHWC" ? 
"NHWC" : "NCHW"; + // Remove one of the spatial dimensions + input_dims.erase(input_dims.begin() + 2); + ksize.erase(ksize.begin() + 2); + strides.erase(strides.begin() + 2); + expected_output_dims.erase(expected_output_dims.begin() + 2); + } + for (bool is_max_pooling : {true, false}) { + Reset(); + NodeDef node_def = + get_pool_nodedef(tf_type, nDim, ksize, strides, p.padding, + data_format, is_max_pooling); + AddTestTensor("input", input_dims, input); + TestOpConverter("my_pool", node_def, expected_output_dims, Status::OK(), + Status::OK(), + ElementsAreArray(p.expected_outputs.at(test_counter))); + test_counter++; + } + } } } -#endif // IS_TRT_VERSION_GE(6, 0, 0, 0) TEST_F(OpConverterTest, ConvertTopK) { // TODO(tmorris): This test isn't setting the input dtype properly. TopK with @@ -5052,6 +5243,148 @@ TEST_P(OpConverterTest3, ConvertGather) { } } +template +NodeDef CreateReduceOp(DataType tf_type, bool keep_dims) { + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type); + auto axis = ops::Placeholder(s.WithOpName("axis"), DT_INT32); + typename OpType::Attrs op_attrs; + op_attrs.keep_dims_ = keep_dims; + auto op = OpType(s.WithOpName("my_reduce"), input, axis, op_attrs); + return op.operation.node()->def(); +} + +// Applies reduction op on sub-sequences of input +// output[i] = reduce(input[m * i : m * (i +1)]) +std::vector CalcReduce(string op_name, std::vector input, int m, + float (*op)(float, float), float init) { + std::vector output(input.size() / m); + for (int i = 0; i < output.size(); i++) { + auto begin = input.begin() + i * m; + auto end = input.begin() + (i + 1) * m; + output[i] = std::accumulate(begin, end, init, op); + if (op_name == "Mean") { + output[i] /= m; + } + } + return output; +} +TEST_P(OpConverterTest1, ConvertReduce) { + { + // Input is weights, should fail. + Reset(); + const NodeDef node_def = CreateReduceOp(tf_type, false); + AddTestWeights("input", {1, 2, 3}, {-3, -2, -1, 0, 1, 2}); + AddTestWeights("axis", {1}, {1}); + RunValidationAndConversion( + node_def, error::UNIMPLEMENTED, + "The input \"input\" for Sum must be a tensor, at my_reduce"); + } + { + // Axis is weights, should fail. + Reset(); + const NodeDef node_def = CreateReduceOp(tf_type, false); + AddTestTensor("input", {1, 2, 3}, {-3, -2, -1, 0, 1, 2}); + AddTestTensor("axis", {1}, DT_INT32, {1}); + RunValidationAndConversion( + node_def, error::UNIMPLEMENTED, + "The input \"axis\" for Sum must be a constant, at my_reduce"); + } + using OpFunc = std::function; + using ValFunc = float (*)(float, float); + struct ReduceTestDescriptor { + string name; + OpFunc get_node; + ValFunc val_func; + float init_val; + }; + std::vector op_test_info{ + {"Sum", CreateReduceOp, [](float x, float y) { return x + y; }, + 0}, + {"Prod", CreateReduceOp, + [](float x, float y) { return x * y; }, 1}, + {"Mean", CreateReduceOp, + [](float x, float y) { return x + y; }, 0}, + {"Min", CreateReduceOp, + [](float x, float y) { return y < x ? y : x; }, 1000}, + {"Max", CreateReduceOp, + [](float x, float y) { return x < y ? 
y : x; }, -1000}}; + + std::vector input_values{1, 2, 3, 4, 5, 6}; + struct TestParams { + std::vector input_dims; + std::vector input_values; + // Helper array contains the same elements as input but permuted in a way + // that the reduction can be calculated over contiguous elements using + // CalcReduce + std::vector helper_array; + std::vector axis; + int stride; // product of input_dims along axis + Status conversion_status; + }; + std::vector params{ + // Out of range tests + TestParams{{2, 3, 1}, input_values, input_values, {3}, 3}, + TestParams{{2, 3, 1}, input_values, input_values, {-4}, 3}, + // Ok tests + TestParams{{2, 3, 1}, input_values, {1, 4, 2, 5, 3, 6}, {0}, 2}, + TestParams{{2, 3, 1}, input_values, input_values, {1}, 3}, + TestParams{{2, 3, 1}, input_values, input_values, {2}, 1}, + TestParams{{2, 3, 1}, input_values, input_values, {0, 1}, 6}, + // Ok tests with negative axis values + TestParams{{2, 3, 1}, input_values, {1, 4, 2, 5, 3, 6}, {-3}, 2}, + TestParams{{2, 3, 1}, input_values, input_values, {-2}, 3}, + TestParams{{2, 3, 1}, input_values, input_values, {-1}, 1}, + TestParams{{2, 3, 1}, input_values, input_values, {-3, 1}, 6}, + }; + + for (bool keep_dims : {false, true}) { + for (auto& op : op_test_info) { + for (auto p : params) { + SCOPED_TRACE(StrCat(op.name, keep_dims ? "keep_dims" : "")); + Reset(); + NodeDef node_def = op.get_node(tf_type, keep_dims); + + AddTestTensor("input", p.input_dims, p.input_values); + AddTestWeights("axis", {static_cast(p.axis.size())}, + p.axis); + std::vector expected_output_dims(p.input_dims); + + // Set expected output dim and conversion error messages + for (int ax : p.axis) { + int rank = p.input_dims.size(); + if (ax >= rank || ax < -rank) { + p.conversion_status = + errors::InvalidArgument("Axis value of ", ax, + " is out of bounds, must be in " + "range [", + -rank, ", ", rank, "), at my_reduce"); + } else { + int ax_positive = ax >= 0 ? ax : ax + rank; + // Zero marks elements that we will remove later. + expected_output_dims[ax_positive] = keep_dims ? 1 : 0; + if (trt_mode == TrtTestMode::kImplicitBatch && + (ax == 0 || ax == -rank)) { + p.conversion_status = errors::Unimplemented( + "TensorRT does not allow manipulation of the batch " + "dimension, at my_reduce"); + } + } + } + expected_output_dims.erase(std::remove(expected_output_dims.begin(), + expected_output_dims.end(), 0), + expected_output_dims.end()); + VLOG(2) << "out dims " << expected_output_dims; + std::vector expected_values = CalcReduce( + op.name, p.helper_array, p.stride, op.val_func, op.init_val); + TestOpConverter("my_reduce", node_def, expected_output_dims, + p.conversion_status, Status::OK(), + ArrayFloatNear(expected_values)); + } + } + } +} + NodeDef CreateCastOp(DataType tf_type) { Scope s = Scope::NewRootScope(); auto input = ops::Placeholder(s.WithOpName("input"), DT_HALF); @@ -6442,19 +6775,22 @@ void TestConvertResize(OpConverterTest* test) { typedef typename EnumToDataType::Type CType; std::vector> params{ - { - /*input_dims=*/{1, 2, 1}, // H, W, C - /*output_resize_dims=*/{2, 3}, // H_out, W_out - /*input_values=*/CastTestVector({2.0f, -1.0f}), - /*align_corners=*/false, - /*expected_output_dims=*/{2, 3, 1}, // H, W, C - /*expected_nearest_output_values=*/ - CastTestVector({2.0f, 2.0f, -1.0f, 2.0f, 2.0f, -1.0f}), - /*expected_bilinear_output_values=*/ - CastTestVector({2.0f, 0.f, -1.0f, 2.0f, 0.f, -1.0f}), - }, - { - /*input_dims=*/{1, 2, 1}, // H, W, C +// TODO(b/162442839): Enable the test parameters for TRT 7.1.3.x. 
+#if !IS_TRT_VERSION_GE(7, 1, 3, 0) + { + /*input_dims=*/{1, 2, 1}, // H, W, C + /*output_resize_dims=*/{2, 3}, // H_out, W_out + /*input_values=*/CastTestVector({2.0f, -1.0f}), + /*align_corners=*/false, + /*expected_output_dims=*/{2, 3, 1}, // H, W, C + /*expected_nearest_output_values=*/ + CastTestVector({2.0f, 2.0f, -1.0f, 2.0f, 2.0f, -1.0f}), + /*expected_bilinear_output_values=*/ + CastTestVector({2.0f, 0.f, -1.0f, 2.0f, 0.f, -1.0f}), + }, +#endif + { + /*input_dims=*/{1, 2, 1}, // H, W, C /*output_resize_dims=*/{2, 3}, // H_out, W_out /*input_values=*/CastTestVector({2.0f, -1.0f}), /*align_corners=*/true, @@ -6463,7 +6799,8 @@ void TestConvertResize(OpConverterTest* test) { CastTestVector({2.0f, 2.0f, -1.0f, 2.0f, 2.0f, -1.0f}), /*expected_bilinear_output_values=*/ CastTestVector({2.0f, 0.5f, -1.0f, 2.0f, 0.5f, -1.0f}), - }}; + } + }; for (int i = 0; i < params.size(); ++i) { test->Reset(); diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc index 1cf98d135cb..4d6f8fa1b31 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc @@ -86,6 +86,7 @@ void TRTOptimizationPass::PrintDebugInfo(grappler::Cluster* cluster, string offset2 = StrCat(offset, offset); string offset3 = StrCat(offset2, offset); string offset4 = StrCat(offset2, offset2); + if (cluster) { LOG(INFO) << offset << "type = " << cluster->type(); LOG(INFO) << offset << "num warmup steps = " << cluster->NumWarmupSteps(); @@ -132,7 +133,15 @@ void TRTOptimizationPass::PrintDebugInfo(grappler::Cluster* cluster, } } } + + if (cluster->GetDeviceSet()) { + for (const auto dev : cluster->GetDeviceSet()->devices()) { + LOG(INFO) << "Device name= " << dev->name() << "Pased name= " + << DeviceNameUtils::ParsedNameToString(dev->parsed_name()); + } + } } + LOG(INFO) << "item: " << item.id; if (!item.feed.empty()) { LOG(INFO) << offset << "Feeds :"; @@ -171,13 +180,6 @@ void TRTOptimizationPass::PrintDebugInfo(grappler::Cluster* cluster, } else { LOG(INFO) << offset << "No keep ops"; } - for (const auto dev : cluster->GetDeviceSet()->devices()) { - const auto& pname = dev->parsed_name(); - LOG(INFO) << "Device name= " << dev->name() - << " parsedname job= " << pname.job << " id= " << pname.id - << " has_id: " << pname.has_id << " has_job: " << pname.has_job - << "has_type: " << pname.has_type << " type =" << pname.type; - } } Status TRTOptimizationPass::Optimize(grappler::Cluster* cluster, diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc index 1094555a622..58d1c611463 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc @@ -643,8 +643,10 @@ void TRTEngineOp::ComputeAsync(OpKernelContext* ctx, } // Release any outputs that are allocated, ExecuteNativeSegment will // re-allocate them and fail if they are currently allocated. + // The Tensor pointer in the returned TensorValue must be explicitly + // deleted. 
for (int i = 0; i < ctx->num_outputs(); i++) { - ctx->release_output(i); + delete ctx->release_output(i).tensor; } ExecuteNativeSegment(ctx, helper); return; diff --git a/tensorflow/compiler/tf2tensorrt/segment/segment.cc b/tensorflow/compiler/tf2tensorrt/segment/segment.cc index e7820ca41fe..1337a733f91 100644 --- a/tensorflow/compiler/tf2tensorrt/segment/segment.cc +++ b/tensorflow/compiler/tf2tensorrt/segment/segment.cc @@ -711,15 +711,15 @@ Status SegmentGraph(const Graph* tf_graph, std::unordered_set unsupported_ops; int num_unsupported_ops = 0; - // Getting the operations blacklisted for conversion - string tftrt_op_blacklist_str; + // Getting the operations denylisted for conversion + string tftrt_op_denylist_str; TF_CHECK_OK( - ReadStringFromEnvVar("TF_TRT_OP_BLACKLIST", "", &tftrt_op_blacklist_str)); + ReadStringFromEnvVar("TF_TRT_OP_DENYLIST", "", &tftrt_op_denylist_str)); - auto tftrt_op_blacklist = gtl::FlatSet{}; // non-absl ok + auto tftrt_op_denylist = gtl::FlatSet{}; // non-absl ok - for (const auto& x : str_util::Split(tftrt_op_blacklist_str, ",")) { - tftrt_op_blacklist.insert(x); + for (const auto& x : str_util::Split(tftrt_op_denylist_str, ",")) { + tftrt_op_denylist.insert(x); } // Parsing each node of the graph @@ -761,13 +761,13 @@ Status SegmentGraph(const Graph* tf_graph, const Status status = candidate_fn(node->tf_node()); if (!status.ok()) { exclude_node(status.error_message()); - } else if (tftrt_op_blacklist.count(node->tf_node()->type_string())) { + } else if (tftrt_op_denylist.count(node->tf_node()->type_string())) { // WARNING verbosity since the user explicitly requests this behavior. LOG_WARNING_WITH_PREFIX - << "Blacklisted as TF-TRT candidate, " + << "Denylisted as TF-TRT candidate, " << "(Op type: " << node->tf_node()->type_string() << "), " << "(Op name: " << node->name() << ")"; - exclude_node("Blacklisted with the env var TF_TRT_OP_BLACKLIST"); + exclude_node("Denylisted with the env var TF_TRT_OP_DENYLIST"); } else { VLOG(2) << "Accepted as a TF-TRT candidate, " << "(Op type: " << node->tf_node()->type_string() << "), " @@ -1031,7 +1031,8 @@ Status SegmentGraph(const Graph* tf_graph, }); // Don't use segments whose number of effective nodes is small. - if (num_effective_nodes < options.minimum_segment_size) { + if (num_effective_nodes == 0 || + num_effective_nodes < options.minimum_segment_size) { VLOG(1) << "Segment " << segments->size() << " has only " << num_effective_nodes << " effective nodes, dropping"; continue; diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc index d4f3a524577..a73877bc3cc 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc @@ -74,7 +74,7 @@ void* TRTDeviceAllocator::allocate(uint64_t size, uint64_t alignment, // algorithm uses too much memory. If we don't fail immediately building the // engine can be *very* slow with TensorRT7 when GPU memory is limited. 
AllocationAttributes attributes; - attributes.no_retry_on_failure = true; + attributes.retry_on_failure = false; void* mem = allocator_->AllocateRaw(alignment, total_size, attributes); if (!mem) return nullptr; diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc index 70a0a9a7b65..2f31865751f 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include "absl/algorithm/container.h" #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #if GOOGLE_CUDA && GOOGLE_TENSORRT @@ -35,14 +36,16 @@ void TrtShapeOptimizationProfile::InitProfiles() { << "for each input (min=opt=max)."; } for (auto& shape_vec : input_shapes_) { - std::vector dimvec; - for (auto& shape : shape_vec) { - dimvec.push_back(TensorShapeToTrtDims(shape, false)); + if (!shape_vec.empty()) { + std::vector dimvec(shape_vec.size()); + absl::c_transform(shape_vec, dimvec.begin(), [](TensorShape shape) { + return TensorShapeToTrtDims(shape, false); + }); + // Set min=opt=max. + OptimizationProfileConfig profConfig{dimvec, dimvec, dimvec}; + profiles_.push_back(std::move(profConfig)); + VLOG(1) << "Created profile " << profiles_.back().DebugString(); } - // We set min=opt=max. - OptimizationProfileConfig profConfig{dimvec, dimvec, dimvec}; - profiles_.push_back(std::move(profConfig)); - VLOG(1) << "Created profile " << profiles_.back().DebugString(); } } diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index cac72925dfd..1e57c11b2cf 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -27,6 +27,7 @@ package_group( "//tensorflow/compiler/mlir/...", "//tensorflow/compiler/tests/...", "//tensorflow/compiler/tf2xla/...", + "//tensorflow/core/tpu/...", "//tensorflow/python/compiler/...", ], ) @@ -49,6 +50,7 @@ cc_library( visibility = ["//visibility:public"], deps = [ ":xla_compiler", + ":xla_op_registry", "//tensorflow/compiler/tf2xla/kernels:xla_ops", "//tensorflow/core:framework", "//tensorflow/core:framework_internal", @@ -144,6 +146,7 @@ cc_library( ":tf2xla_proto_cc", ":tf2xla_util", ":xla_compiler", + ":xla_op_registry", "//tensorflow/compiler/aot:aot_only_var_handle_op", "//tensorflow/compiler/tf2xla/kernels:xla_ops", "//tensorflow/compiler/xla/client", @@ -315,14 +318,8 @@ cc_library( srcs = [ "const_analysis.cc", "graph_compiler.cc", - "xla_compilation_device.cc", "xla_compiler.cc", - "xla_context.cc", - "xla_expression.cc", - "xla_helpers.cc", "xla_op_kernel.cc", - "xla_op_registry.cc", - "xla_resource.cc", "xla_cpu_backend.cc", ] + if_cuda_is_configured([ "xla_gpu_backend.cc", @@ -332,14 +329,10 @@ cc_library( hdrs = [ "const_analysis.h", "graph_compiler.h", - "xla_compilation_device.h", "xla_compiler.h", - "xla_context.h", - "xla_expression.h", "xla_helpers.h", "xla_op_kernel.h", "xla_op_registry.h", - "xla_resource.h", ], visibility = [":friends"], deps = [ @@ -350,10 +343,18 @@ cc_library( ":sharding_util", ":side_effect_util", ":tf2xla_util", + ":xla_argument", + ":xla_compilation_device", + ":xla_context", + ":xla_expression", + ":xla_helpers", + ":xla_op_registry", + ":xla_resource", "//tensorflow/compiler/jit:common", "//tensorflow/compiler/jit:flags", "//tensorflow/compiler/jit:shape_inference", "//tensorflow/compiler/jit:xla_cluster_util", + 
"//tensorflow/compiler/mlir/tensorflow:compile_mlir_util_no_tf_dialect_passes", "//tensorflow/compiler/tf2xla/lib:util", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", @@ -369,6 +370,7 @@ cc_library( "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/client/lib:arithmetic", "//tensorflow/compiler/xla/client/lib:constants", + "//tensorflow/compiler/xla/service:hlo", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", @@ -387,6 +389,172 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "xla_compilation_device", + srcs = [ + "xla_compilation_device.cc", + ], + hdrs = [ + "xla_compilation_device.h", + ], + deps = [ + ":common", + ":frontend_attributes_util", + ":sharding_util", + ":xla_context", + ":xla_helpers", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:session_options", + "//tensorflow/core/common_runtime:core_cpu_internal", + ], + alwayslink = 1, +) + +cc_library( + name = "xla_context", + srcs = [ + "xla_context.cc", + ], + hdrs = [ + "xla_context.h", + ], + deps = [ + ":common", + ":xla_expression", + ":xla_helpers", + "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/compiler/xla/client:client_library", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core/common_runtime:core_cpu_internal", + "@com_google_absl//absl/types:span", + ], + alwayslink = 1, +) + +cc_library( + name = "xla_op_registry", + srcs = [ + "xla_op_registry.cc", + ], + hdrs = [ + "xla_op_registry.h", + ], + visibility = [":friends"], + deps = [ + ":common", + ":xla_context", + "//tensorflow/compiler/jit:flags", + "//tensorflow/compiler/jit:xla_cluster_util", + "//tensorflow/compiler/xla/client:client_library", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:session_options", + "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/common_runtime:core_cpu_internal", + ], + alwayslink = 1, +) + +cc_library( + name = "xla_expression", + srcs = [ + "xla_expression.cc", + ], + hdrs = [ + "xla_expression.h", + ], + deps = [ + ":common", + ":xla_resource", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla/client", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/types:optional", + ], + alwayslink = 1, +) + +cc_library( + name = "xla_resource", + srcs = [ + "xla_resource.cc", + ], + hdrs = [ + "xla_resource.h", + ], + deps = [ + ":common", + ":sharding_util", + ":xla_helpers", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + ], + alwayslink = 1, +) + +cc_library( + name = "xla_helpers", + srcs = [ + "xla_helpers.cc", + ], + hdrs = [ + "xla_helpers.h", + ], + visibility = [":friends"], + deps = [ + ":common", + ":host_compute_metadata_proto_cc", + "//tensorflow/compiler/tf2xla/lib:util", + 
"//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", + "//tensorflow/compiler/xla/client/lib:arithmetic", + "//tensorflow/compiler/xla/client/lib:constants", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "@com_google_absl//absl/types:span", + ], + alwayslink = 1, +) + +cc_library( + name = "xla_argument", + srcs = [ + "xla_argument.cc", + ], + hdrs = [ + "xla_argument.h", + ], + deps = [ + ":host_compute_metadata_proto_cc", + ":xla_resource", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/core:framework", + "@com_google_absl//absl/types:span", + ], + alwayslink = 1, +) + cc_library( name = "common", srcs = [ @@ -563,6 +731,8 @@ tf_cc_test( ":common", ":side_effect_util", ":xla_compiler", + ":xla_expression", + ":xla_resource", "//tensorflow/cc:cc_ops", "//tensorflow/cc:function_ops", "//tensorflow/cc:functional_ops", diff --git a/tensorflow/compiler/tf2xla/const_analysis.cc b/tensorflow/compiler/tf2xla/const_analysis.cc index 1da34266460..694aa342aac 100644 --- a/tensorflow/compiler/tf2xla/const_analysis.cc +++ b/tensorflow/compiler/tf2xla/const_analysis.cc @@ -74,8 +74,7 @@ Status CondConstInputIndices( *(fbody->graph), &compile_time_const_arg_indices, /*compile_time_const_nodes=*/nullptr, flib_runtime)); } - for (int i = 0, iter_limit = compile_time_const_arg_indices.size(); - i < iter_limit; i++) { + for (int i = 0, end = compile_time_const_arg_indices.size(); i < end; i++) { if (compile_time_const_arg_indices[i]) { // The 0th input is the pred or branch index, which is not passed to the // branches. So the i'th input of a branch function corresponds to the @@ -141,7 +140,7 @@ Status GetCompileTimeConstInputs(const NodeDef& node, const OpKernel* op_kernel, GetFunctionBody(flib_runtime, node, "else_branch", &felse)); return CondConstInputIndices({fthen, felse}, const_input_idxs, flib_runtime); - } else if (node.op() == "Case") { + } else if (node.op() == "Case" || node.op() == "StatelessCase") { std::vector branch_bodies; TF_RETURN_IF_ERROR( GetFunctionBodies(flib_runtime, node, "branches", &branch_bodies)); diff --git a/tensorflow/compiler/tf2xla/functionalize_cond.cc b/tensorflow/compiler/tf2xla/functionalize_cond.cc index 459b2814c0d..54abccb4cfc 100644 --- a/tensorflow/compiler/tf2xla/functionalize_cond.cc +++ b/tensorflow/compiler/tf2xla/functionalize_cond.cc @@ -224,8 +224,8 @@ string DebugString(const CondArgNodes& nodes) { } StateMap::CondId StateMap::LookupCondId(const Node* node) const { - if (node->id() < node_to_condid_map_.size()) - return node_to_condid_map_[node->id()]; + const int64 map_size = node_to_condid_map_.size(); + if (node->id() < map_size) return node_to_condid_map_[node->id()]; return added_node_condid_mapping_.at(node->id()); } @@ -235,15 +235,16 @@ StateMap::CondId StateMap::GetCondId(const StateMap::CondState& state) { } void StateMap::ResetCondId(const Node* node, StateMap::CondId id) { - if (node->id() < node_to_condid_map_.size()) + const int64 map_size = node_to_condid_map_.size(); + if (node->id() < map_size) node_to_condid_map_[node->id()] = id; else added_node_condid_mapping_[node->id()] = id; } StateMap::AncestorId StateMap::LookupAncestorId(const Node* node) const { - if (node->id() < node_to_ancestorid_map_.size()) - return node_to_ancestorid_map_[node->id()]; + const int64 map_size = node_to_ancestorid_map_.size(); + if (node->id() < map_size) 
return node_to_ancestorid_map_[node->id()]; return added_node_ancestorid_mapping_.at(node->id()); } @@ -254,7 +255,8 @@ StateMap::AncestorId StateMap::GetAncestorId( } void StateMap::ResetAncestorId(const Node* node, StateMap::AncestorId id) { - if (node->id() < node_to_ancestorid_map_.size()) + const int64 map_size = node_to_ancestorid_map_.size(); + if (node->id() < map_size) node_to_ancestorid_map_[node->id()] = id; else added_node_ancestorid_mapping_[node->id()] = id; diff --git a/tensorflow/compiler/tf2xla/functionalize_while.cc b/tensorflow/compiler/tf2xla/functionalize_while.cc index cea4973f42b..dce5efe5557 100644 --- a/tensorflow/compiler/tf2xla/functionalize_while.cc +++ b/tensorflow/compiler/tf2xla/functionalize_while.cc @@ -130,7 +130,7 @@ Status BuildLoopCondition(const Graph& graph, WhileLoopFrame* frame, std::vector squash_src_outputs(graph.num_node_ids(), false); // Build one _Arg node for each Enter node. - for (int i = 0; i < frame->args.size(); ++i) { + for (int i = 0, end = frame->args.size(); i < end; ++i) { const WhileLoopArg& arg = frame->args[i]; TF_ASSIGN_OR_RETURN(Node * arg_node, @@ -170,7 +170,7 @@ Status BuildLoopBody(const Graph& graph, WhileLoopFrame* frame, std::vector next_iterations; next_iterations.reserve(frame->args.size()); arg_types->reserve(frame->args.size()); - for (int i = 0; i < frame->args.size(); ++i) { + for (int i = 0, end = frame->args.size(); i < end; ++i) { const WhileLoopArg& arg = frame->args[i]; DataType dtype = arg.enter->input_type(0); @@ -235,7 +235,7 @@ Status FunctionalizeLoop(Graph* graph, WhileLoopFrame* frame, } else { std::vector edges(arg.enter->out_edges().begin(), arg.enter->out_edges().end()); - for (int i = 0; i < edges.size(); ++i) { + for (int i = 0, end = edges.size(); i < end; ++i) { if (edges[i]->IsControlEdge() && edges[i]->dst()->IsSink()) { continue; } @@ -447,7 +447,7 @@ Status FunctionalizeLoop(Graph* graph, WhileLoopFrame* frame, } } std::vector inputs; - for (int i = 0; i < frame->args.size(); ++i) { + for (int i = 0, end = frame->args.size(); i < end; ++i) { const WhileLoopArg& arg = frame->args[i]; const Edge* in_edge; TF_RETURN_IF_ERROR(arg.enter->input_edge(0, &in_edge)); @@ -463,7 +463,7 @@ Status FunctionalizeLoop(Graph* graph, WhileLoopFrame* frame, TF_ASSIGN_OR_RETURN(Node * while_node, AddNodeDefToGraph(while_def, graph)); // Copies edges to the Enter nodes and from the Exit nodes onto the While. 
- for (int i = 0; i < frame->args.size(); ++i) { + for (int i = 0, end = frame->args.size(); i < end; ++i) { const WhileLoopArg& arg = frame->args[i]; const Edge* in_edge; TF_RETURN_IF_ERROR(arg.enter->input_edge(0, &in_edge)); diff --git a/tensorflow/compiler/tf2xla/graph_compiler.cc b/tensorflow/compiler/tf2xla/graph_compiler.cc index 5f6dcad5538..30a7e94775b 100644 --- a/tensorflow/compiler/tf2xla/graph_compiler.cc +++ b/tensorflow/compiler/tf2xla/graph_compiler.cc @@ -65,7 +65,7 @@ Status PrepareArguments(XlaOpKernelContext* ctx, Graph* graph, /*compile_time_const_nodes=*/nullptr, ctx->function_library())); args->resize(expressions.size()); - for (int i = 0, iter_limit = args->size(); i < iter_limit; ++i) { + for (int i = 0, end = args->size(); i < end; ++i) { XlaCompiler::Argument& arg = (*args)[i]; arg.type = ctx->input_type(i); arg.shape = ctx->InputShape(i); @@ -269,7 +269,7 @@ Status GraphCompiler::CompileFunctionalNode(Node* n, TF_RET_CHECK(arguments.size() == expressions.size()); std::vector handles; - for (int64 i = 0, iter_limit = expressions.size(); i < iter_limit; ++i) { + for (int64 i = 0, end = expressions.size(); i < end; ++i) { if (arguments[i].kind == XlaCompiler::Argument::kConstant) { continue; } @@ -313,8 +313,7 @@ Status GraphCompiler::CompileFunctionalNode(Node* n, } } - for (int64 i = 0, iter_limit = result.resource_updates.size(); i < iter_limit; - i++) { + for (int64 i = 0, end = result.resource_updates.size(); i < end; i++) { if (result.resource_updates[i].modified) { XlaResource* resource = expressions[result.resource_updates[i].input_index]->resource(); diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD index e072225566d..26051c98cb7 100644 --- a/tensorflow/compiler/tf2xla/kernels/BUILD +++ b/tensorflow/compiler/tf2xla/kernels/BUILD @@ -145,7 +145,12 @@ tf_kernel_library( "//tensorflow/compiler/jit:xla_activity_listener", "//tensorflow/compiler/jit:xla_activity_proto_cc", "//tensorflow/compiler/tf2xla:common", + "//tensorflow/compiler/tf2xla:xla_compilation_device", "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/tf2xla:xla_context", + "//tensorflow/compiler/tf2xla:xla_helpers", + "//tensorflow/compiler/tf2xla:xla_op_registry", + "//tensorflow/compiler/tf2xla:xla_resource", "//tensorflow/compiler/tf2xla/lib:broadcast", "//tensorflow/compiler/tf2xla/lib:data_format", "//tensorflow/compiler/tf2xla/lib:random", @@ -223,6 +228,8 @@ cc_library( deps = [ "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/tf2xla:xla_helpers", + "//tensorflow/compiler/tf2xla:xla_op_registry", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:util", @@ -230,7 +237,7 @@ cc_library( "//tensorflow/compiler/xla/client/lib:arithmetic", "//tensorflow/compiler/xla/client/lib:constants", "//tensorflow/core:framework", - "//tensorflow/core:framework_bounds_check", + "//tensorflow/core/framework:bounds_check", "//tensorflow/core/kernels:conv_grad_shape_utils", "@com_google_absl//absl/types:span", ], @@ -276,6 +283,8 @@ tf_kernel_library( "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/tf2xla:side_effect_util", "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/tf2xla:xla_helpers", + "//tensorflow/compiler/tf2xla:xla_op_registry", "//tensorflow/compiler/tf2xla/ops:xla_ops", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:status_macros", @@ -296,6 +305,8 @@ 
tf_kernel_library( "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/tf2xla:side_effect_util", "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/tf2xla:xla_context", + "//tensorflow/compiler/tf2xla:xla_op_registry", "//tensorflow/compiler/tf2xla/ops:xla_ops", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla/client:xla_builder", @@ -314,6 +325,8 @@ tf_kernel_library( "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/tf2xla:side_effect_util", "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/tf2xla:xla_context", + "//tensorflow/compiler/tf2xla:xla_op_registry", "//tensorflow/compiler/tf2xla/ops:xla_ops", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla/client:xla_builder", @@ -333,6 +346,7 @@ tf_kernel_library( ], deps = [ "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/tf2xla:xla_op_registry", "//tensorflow/core:array_ops_op_lib", "//tensorflow/core:framework", "//tensorflow/core:lib", diff --git a/tensorflow/compiler/tf2xla/kernels/case_op.cc b/tensorflow/compiler/tf2xla/kernels/case_op.cc index fbd54f1ef39..7a3d87c101c 100644 --- a/tensorflow/compiler/tf2xla/kernels/case_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/case_op.cc @@ -160,17 +160,15 @@ void XlaCaseOp::Compile(XlaOpKernelContext* ctx) { XlaCompiler* compiler = ctx->compiler(); std::vector branch_results(num_branches); - std::vector branch_results_p(num_branches); for (int j = 0; j < num_branches; ++j) { OP_REQUIRES_OK(ctx, compiler->CompileFunction(options, branches[j], arguments, &branch_results[j])); - branch_results_p[j] = &branch_results[j]; } bool has_tensor_array_gradients = false; - for (XlaCompiler::CompilationResult* result : branch_results_p) { - for (const XlaCompiler::ResourceUpdate& update : result->resource_updates) { + for (XlaCompiler::CompilationResult& result : branch_results) { + for (const XlaCompiler::ResourceUpdate& update : result.resource_updates) { XlaResource* resource; OP_REQUIRES_OK(ctx, ctx->GetResourceInput(update.input_index + 1, &resource)); @@ -373,5 +371,7 @@ void XlaCaseOp::Compile(XlaOpKernelContext* ctx) { REGISTER_XLA_OP(Name("Case").AllowResourceTypes().AllowVariantTypes(), XlaCaseOp); +REGISTER_XLA_OP(Name("StatelessCase").AllowResourceTypes().AllowVariantTypes(), + XlaCaseOp); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/concat_op.cc b/tensorflow/compiler/tf2xla/kernels/concat_op.cc index 09c97de13eb..d0f24b5f561 100644 --- a/tensorflow/compiler/tf2xla/kernels/concat_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/concat_op.cc @@ -186,9 +186,11 @@ class ConcatOffsetOp : public XlaOpKernel { const int32 inp0_element = inp0_dims[j]; const int32 inp_element = inp_dims[j]; OP_REQUIRES(ctx, inp0_element == inp_element, - errors::InvalidArgument("input[", i, ",", j, - "] mismatch: ", inp0_element, - " vs. ", inp_element)); + errors::InvalidArgument( + "All dimensions except ", axis, " must match. 
Input ", + i, " has shape [", absl::StrJoin(inp_dims, " "), + "] and doesn't match input 0 with shape [", + absl::StrJoin(inp0_dims, " "), "].")); out_vec(j) = 0; } } diff --git a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc index e0bc2ba5052..d29644dd0de 100644 --- a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc +++ b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc @@ -44,7 +44,7 @@ namespace tensorflow { namespace { // Returns the expanded size of a filter used for depthwise convolution. -// If `shape` is [H, W, ..., M, N] returns [H, W, ..., M, M*N]. +// If `shape` is [H, W, ..., M, N] returns [H, W, ..., 1, M*N]. xla::Shape GroupedFilterShapeForDepthwiseConvolution( const xla::Shape& filter_shape) { int64 input_feature_dim = filter_shape.dimensions_size() - 2; @@ -52,7 +52,7 @@ xla::Shape GroupedFilterShapeForDepthwiseConvolution( int64 depthwise_multiplier = filter_shape.dimensions(output_feature_dim); int64 input_feature = filter_shape.dimensions(input_feature_dim); - // Create a [H, W, ..., 1, N*M] reshape of the filter. + // Create a [H, W, ..., 1, M*N] reshape of the filter. xla::Shape grouped_filter_shape = filter_shape; grouped_filter_shape.set_dimensions(input_feature_dim, 1); grouped_filter_shape.set_dimensions(output_feature_dim, @@ -203,6 +203,10 @@ xla::StatusOr ConvOpAttrs::Create(int num_spatial_dims, return errors::InvalidArgument("Invalid data format: ", data_format); } + TF_RETURN_IF_ERROR(CheckValidPadding(attrs.padding, attrs.explicit_paddings, + /*num_dims=*/num_spatial_dims + 2, + attrs.data_format)); + return attrs; } diff --git a/tensorflow/compiler/tf2xla/kernels/if_op.cc b/tensorflow/compiler/tf2xla/kernels/if_op.cc index 2a059f78526..3a88fcf4879 100644 --- a/tensorflow/compiler/tf2xla/kernels/if_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/if_op.cc @@ -47,6 +47,122 @@ XlaIfOp::XlaIfOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { } } +// Populates tensor array gradients for compiled branches, returns whether the +// set of found tensor array gradients is non-empty. +static xla::StatusOr PopulateTensorArrayGradients( + XlaOpKernelContext* ctx, xla::XlaBuilder* b, + absl::Span arguments, + XlaCompiler::CompilationResult* then_result, + XlaCompiler::CompilationResult* else_result) { + bool has_tensor_array_gradients = false; + for (XlaCompiler::CompilationResult* result : {then_result, else_result}) { + for (const XlaCompiler::ResourceUpdate& update : result->resource_updates) { + XlaResource* resource; + TF_RETURN_IF_ERROR( + ctx->GetResourceInput(update.input_index + 1, &resource)); + XlaCompiler::Argument& arg = arguments[update.input_index]; + + // Add any TensorArray gradients touched by the then/else computation to + // the enclosing graph. + for (const string& grad_source : update.tensor_array_gradients_accessed) { + VLOG(5) << "TensorArray " << resource->name() << " accessed gradient " + << grad_source; + XlaResource* gradient; + TF_RETURN_IF_ERROR(resource->GetOrCreateTensorArrayGradient( + grad_source, b, &gradient)); + } + // Add all of the TensorArray gradients to the argument. For simplicity, + // we always pass all known gradients. + for (const auto& gradient : resource->tensor_array_gradients()) { + arg.tensor_array_gradients.insert(gradient.first); + } + if (!resource->tensor_array_gradients().empty()) + has_tensor_array_gradients = true; + } + } + return has_tensor_array_gradients; +} + +// Checks that shapes matches on both sides of the conditional. 
+static Status ValidateShapes( + XlaOpKernelContext* ctx, const XlaCompiler::CompilationResult& then_result, + const XlaCompiler::CompilationResult& else_result) { + // Check that both branches have identical input shapes. + if (then_result.xla_input_shapes.size() != 1) { + return errors::FailedPrecondition("Expected one input shape"); + } + + xla::Shape then_input_shape = then_result.xla_input_shapes[0]; + if (!then_input_shape.IsTuple()) { + return errors::FailedPrecondition("Expected tuple shape"); + } + + if (else_result.xla_input_shapes.size() != 1) { + return errors::FailedPrecondition("Expected one input shape"); + } + xla::Shape else_input_shape = else_result.xla_input_shapes[0]; + if (!else_input_shape.IsTuple()) { + return errors::FailedPrecondition("Expected tuple shape"); + } + if (!xla::ShapeUtil::Compatible(then_input_shape, else_input_shape)) { + return errors::InvalidArgument( + "Input shapes of then and else branches do not match: ", + xla::ShapeUtil::HumanString(then_input_shape), " vs. ", + xla::ShapeUtil::HumanString(else_input_shape)); + } + + // Check that both branches have identical output shapes. + if (!xla::ShapeUtil::Compatible(then_result.xla_output_shape, + else_result.xla_output_shape)) { + return errors::InvalidArgument( + "Output shapes of then and else branches do not match: ", + xla::ShapeUtil::HumanString(then_result.xla_output_shape), " vs. ", + xla::ShapeUtil::HumanString(else_result.xla_output_shape)); + } + + // Check that both branches have same TensorList output indices. + for (int output_index = 0; output_index < then_result.outputs.size(); + output_index++) { + bool is_tensor_list_in_then_branch = + then_result.outputs[output_index].is_tensor_list; + bool is_tensor_list_in_else_branch = + else_result.outputs[output_index].is_tensor_list; + if (is_tensor_list_in_then_branch != is_tensor_list_in_else_branch) { + return errors::FailedPrecondition( + "Output #", output_index, " is ", + (is_tensor_list_in_then_branch ? "" : "not"), + " a TensorList in then branch, but is ", + (is_tensor_list_in_else_branch ? "" : "not"), + " a TensorList in else branch"); + } + } + + VLOG(2) << "Input shape: " << xla::ShapeUtil::HumanString(then_input_shape); + VLOG(2) << "Output shape: " + << xla::ShapeUtil::HumanString(then_result.xla_output_shape); + + // We set return_updated_values_for_all_resources=true and we pass the same + // arguments to both computations, so the resource update count must match. + if (then_result.resource_updates.size() != + else_result.resource_updates.size()) { + return errors::FailedPrecondition( + "Different number of resources in then and else branch"); + } + + for (int i = 0; i < then_result.resource_updates.size(); ++i) { + const auto& lhs = then_result.resource_updates[i]; + const auto& rhs = else_result.resource_updates[i]; + bool equal = lhs.input_index == rhs.input_index && lhs.shape == rhs.shape && + lhs.tensor_array_gradients_accessed == + rhs.tensor_array_gradients_accessed; + if (!equal) { + return errors::FailedPrecondition( + "Mismatch in resource of then and else branch for resource ", i); + } + } + return Status::OK(); +} + // TODO(b/35949885): There is duplication here with the handling of the // while_op. Refactor the common code out/rework. 
void XlaIfOp::Compile(XlaOpKernelContext* ctx) { @@ -137,35 +253,12 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) { OP_REQUIRES_OK(ctx, compiler->CompileFunction(options, else_branch_, arguments, &else_result)); - bool has_tensor_array_gradients = false; - for (XlaCompiler::CompilationResult* result : {&then_result, &else_result}) { - for (const XlaCompiler::ResourceUpdate& update : result->resource_updates) { - XlaResource* resource; - OP_REQUIRES_OK(ctx, - ctx->GetResourceInput(update.input_index + 1, &resource)); - XlaCompiler::Argument& arg = arguments[update.input_index]; - - // Add any TensorArray gradients touched by the then/else computation to - // the enclosing graph. - for (const string& grad_source : update.tensor_array_gradients_accessed) { - VLOG(5) << "TensorArray " << resource->name() << " accessed gradient " - << grad_source; - XlaResource* gradient; - OP_REQUIRES_OK(ctx, resource->GetOrCreateTensorArrayGradient( - grad_source, b, &gradient)); - } - // Add all of the TensorArray gradients to the argument. For simplicity, - // we always pass all known gradients. - for (const auto& gradient : resource->tensor_array_gradients()) { - arg.tensor_array_gradients.insert(gradient.first); - } - if (!resource->tensor_array_gradients().empty()) - has_tensor_array_gradients = true; - } - } + xla::StatusOr has_tensor_array_gradients = PopulateTensorArrayGradients( + ctx, b, absl::MakeSpan(arguments), &then_result, &else_result); + OP_REQUIRES_OK(ctx, has_tensor_array_gradients.status()); // Recompile the functions to update the argument shapes for tensor arrays. - if (has_tensor_array_gradients) { + if (*has_tensor_array_gradients) { then_result = {}; OP_REQUIRES_OK(ctx, compiler->CompileFunction(options, then_branch_, arguments, &then_result)); @@ -174,72 +267,7 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) { arguments, &else_result)); } - // Check that both branches have identical input shapes. - OP_REQUIRES(ctx, then_result.xla_input_shapes.size() == 1, - errors::FailedPrecondition("Expected one input shape")); - xla::Shape then_input_shape = then_result.xla_input_shapes[0]; - OP_REQUIRES(ctx, then_input_shape.IsTuple(), - errors::FailedPrecondition("Expected tuple shape")); - OP_REQUIRES(ctx, else_result.xla_input_shapes.size() == 1, - errors::FailedPrecondition("Expected one input shape")); - xla::Shape else_input_shape = else_result.xla_input_shapes[0]; - OP_REQUIRES(ctx, else_input_shape.IsTuple(), - errors::FailedPrecondition("Expected tuple shape")); - OP_REQUIRES(ctx, - xla::ShapeUtil::Compatible(then_input_shape, else_input_shape), - errors::InvalidArgument( - "Input shapes of then and else branches do not match: ", - xla::ShapeUtil::HumanString(then_input_shape), " vs. ", - xla::ShapeUtil::HumanString(else_input_shape))); - - // Check that both branches have identical output shapes. - OP_REQUIRES( - ctx, - xla::ShapeUtil::Compatible(then_result.xla_output_shape, - else_result.xla_output_shape), - errors::InvalidArgument( - "Output shapes of then and else branches do not match: ", - xla::ShapeUtil::HumanString(then_result.xla_output_shape), " vs. ", - xla::ShapeUtil::HumanString(else_result.xla_output_shape))); - - // Check that both branches have same TensorList output indices. 
- for (int output_index = 0; output_index < then_result.outputs.size(); - output_index++) { - bool is_tensor_list_in_then_branch = - then_result.outputs[output_index].is_tensor_list; - bool is_tensor_list_in_else_branch = - else_result.outputs[output_index].is_tensor_list; - OP_REQUIRES( - ctx, is_tensor_list_in_then_branch == is_tensor_list_in_else_branch, - errors::FailedPrecondition("Output #", output_index, " is ", - (is_tensor_list_in_then_branch ? "" : "not"), - " a TensorList in then branch, but is ", - (is_tensor_list_in_else_branch ? "" : "not"), - " a TensorList in else branch")); - } - - VLOG(2) << "Input shape: " << xla::ShapeUtil::HumanString(then_input_shape); - VLOG(2) << "Output shape: " - << xla::ShapeUtil::HumanString(then_result.xla_output_shape); - - // We set return_updated_values_for_all_resources=true and we pass the same - // arguments to both computations, so the resource update count must match. - OP_REQUIRES(ctx, - then_result.resource_updates.size() == - else_result.resource_updates.size(), - errors::FailedPrecondition( - "Different number of resources in then and else branch")); - for (int i = 0; i < then_result.resource_updates.size(); ++i) { - const auto& lhs = then_result.resource_updates[i]; - const auto& rhs = else_result.resource_updates[i]; - bool equal = lhs.input_index == rhs.input_index && lhs.shape == rhs.shape && - lhs.tensor_array_gradients_accessed == - rhs.tensor_array_gradients_accessed; - OP_REQUIRES( - ctx, equal, - errors::FailedPrecondition( - "Mismatch in resource of then and else branch for resource ", i)); - } + OP_REQUIRES_OK(ctx, ValidateShapes(ctx, then_result, else_result)); int num_inputs = then_result.input_mapping.size(); std::vector inputs(num_inputs); @@ -263,22 +291,18 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) { } } - auto input_tuple = xla::Tuple(b, inputs); + xla::XlaOp input_tuple = xla::Tuple(b, inputs); xla::XlaOp outputs = xla::Conditional(ctx->Input(0), input_tuple, *then_result.computation, input_tuple, *else_result.computation); + // Sets non-variable outputs. for (int i = 0; i < output_types_.size(); ++i) { xla::XlaOp output_handle = xla::GetTupleElement(outputs, i); if (VLOG_IS_ON(2)) { - LOG(INFO) << "Setting output " << i; - auto shape_or = b->GetShape(output_handle); - if (shape_or.ok()) { - LOG(INFO) << "Shape for output " << i << ": " - << xla::ShapeUtil::HumanString(shape_or.ValueOrDie()); - } else { - LOG(INFO) << "Shape unknown for output " << i; - } + xla::StatusOr shape = b->GetShape(output_handle); + VLOG(2) << "Setting output " << i << " with shape " + << (shape.ok() ? shape->ToString() : ""); } // We have checked that both branches have same TensorList output indices. if (then_result.outputs[i].is_tensor_list) { @@ -287,6 +311,7 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) { ctx->SetOutput(i, output_handle); } } + if (has_token_input_output_) { // Set token output for this "If" op. 
Token output is the last output of // XLA computation, which comes after all "normal" TF outputs and resource diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops.cc b/tensorflow/compiler/tf2xla/kernels/index_ops.cc index 31637d9d8a0..df6d9b475dc 100644 --- a/tensorflow/compiler/tf2xla/kernels/index_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/index_ops.cc @@ -71,7 +71,7 @@ void XlaArgMinMaxOp::Compile(XlaOpKernelContext* ctx) { if (is_gpu_) { output = xla::ArgMinTwoPass(input, index_xla_type, axis); } else { - output = xla::ArgMin(input, index_xla_type, axis); + output = xla::ArgMin(input, index_xla_type, axis, /*stable=*/true); } } else { if (is_gpu_) { diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_diag_ops.cc b/tensorflow/compiler/tf2xla/kernels/matrix_diag_ops.cc index 57e961917cc..c8da75157fc 100644 --- a/tensorflow/compiler/tf2xla/kernels/matrix_diag_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/matrix_diag_ops.cc @@ -243,8 +243,9 @@ class MatrixDiagOp : public XlaOpKernel { errors::InvalidArgument("MatrixDiag op must have at least one input")); const TensorShape diag_shape = context->InputShape(0); OP_REQUIRES(context, TensorShapeUtils::IsVectorOrHigher(diag_shape), - errors::InvalidArgument("Expected >= 1 dims, got shape ", - diag_shape.DebugString())); + errors::InvalidArgument( + "diagonal must be at least 1-dim, received shape: ", + diag_shape.DebugString())); const DataType dtype = context->expected_output_dtype(0); const xla::XlaOp zero = XlaHelpers::Zero(context->builder(), dtype); diff --git a/tensorflow/compiler/tf2xla/kernels/reshape_op.cc b/tensorflow/compiler/tf2xla/kernels/reshape_op.cc index bf9a9150ea6..a85ba547179 100644 --- a/tensorflow/compiler/tf2xla/kernels/reshape_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/reshape_op.cc @@ -109,27 +109,33 @@ class ReshapeOp : public XlaOpKernel { VLOG(2) << "Reshape from " << input_shape.DebugString() << " to " << shape.DebugString() << ", unknown_index=" << unknown_index; - shape_input.clear(); - // Run get input again, this time with dynamic dimension represented as - // "-1" - ctx->set_dynamic_dimension_is_minus_one(true); - OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &shape_input)); - int dynamic_dimension = -1; - - for (int d = 0; d < num_dims; ++d) { - const int32 size = shape_input[d]; - if (size == -1) { - if (dynamic_dimension == -1) { + if (ctx->InputXlaShape(0)->is_dynamic()) { + std::vector<bool> dynamic_dims; + OP_REQUIRES_OK(ctx, + ctx->ResolveInputDynamismIntoPredVector(1, &dynamic_dims)); + for (int d = 0; d < num_dims; ++d) { + const bool dim_is_dynamic = dynamic_dims[d]; + if (dim_is_dynamic) { dynamic_dimension = d; - } else { - if (unknown_index != d) { - dynamic_dimension = d; - } } } - } + // When reshaping from a dynamic dimension, the unknown index is considered + // dynamic. E.g., + // [<=10] + // | + // Reshape + // | + // [2, -1] + // The second dimension is dynamic. + if (dynamic_dimension == -1) { + dynamic_dimension = unknown_index; + } + VLOG(2) << "Reshape from " << ctx->InputXlaShape(0)->ToString() << " to " << xla::VectorString(shape.dim_sizes()) << ", dynamic_dim=" << dynamic_dimension; + } // Pass unknown_index to Xla::Reshape as a hint for dynamic shape inference // in XLA to know which output dimension is dynamic.
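For illustration only, a hedged sketch of the hint described in the comment above; the helper name and sizes are invented, and the kernel's actual call follows below. With an operand of shape f32[<=10] reshaped to [2, -1], dimension 1 of the result carries the dynamic size, so index 1 is passed as the inferred dimension.

#include "tensorflow/compiler/xla/client/xla_builder.h"

// `x` has shape f32[<=10]; its true size is only known at run time.
// Passing inferred_dimension=1 marks dimension 1 of the f32[2, 5] result as
// the dynamic one, i.e. the output behaves like f32[2, <=5].
xla::XlaOp ExampleDynamicReshape(xla::XlaOp x) {
  return xla::ReshapeWithInferredDimension(x, /*new_sizes=*/{2, 5},
                                           /*inferred_dimension=*/1);
}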
ctx->SetOutput(0, xla::ReshapeWithInferredDimension( diff --git a/tensorflow/compiler/tf2xla/kernels/sharding_op.cc b/tensorflow/compiler/tf2xla/kernels/sharding_op.cc index 1047580264b..da268fe283c 100644 --- a/tensorflow/compiler/tf2xla/kernels/sharding_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/sharding_op.cc @@ -30,8 +30,15 @@ class ShardingOp : public XlaOpKernel { ~ShardingOp() override = default; void Compile(XlaOpKernelContext* ctx) override { - xla::XlaOp input = ctx->Input(0); - auto shape_or = ctx->InputXlaShape(0); + xla::XlaOp input; + { + // The builder might create a broadcast from a constant, so we clear + // sharding for the input. + xla::XlaScopedShardingAssignment no_sharding(ctx->builder(), + absl::nullopt); + input = ctx->Input(0); + } + auto shape_or = ctx->builder()->GetShape(input); OP_REQUIRES_OK(ctx, shape_or.status()); ctx->SetOutput( diff --git a/tensorflow/compiler/tf2xla/kernels/split_op.cc b/tensorflow/compiler/tf2xla/kernels/split_op.cc index 7a0e240400b..dbaa84c223d 100644 --- a/tensorflow/compiler/tf2xla/kernels/split_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/split_op.cc @@ -105,6 +105,10 @@ class SplitVOp : public XlaOpKernel { const TensorShape input_shape = ctx->InputShape(0); const TensorShape index_shape = ctx->InputShape(2); + OP_REQUIRES(ctx, index_shape.num_elements() == 1, + errors::InvalidArgument( + "split_dim_tensor must have exactly one element.")); + int64 split_dim_orig; OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(2, &split_dim_orig)); int64 split_dim = split_dim_orig < 0 ? split_dim_orig + input_shape.dims() diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc b/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc index aa71e4d4364..0e367e10ec4 100644 --- a/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc +++ b/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc @@ -504,7 +504,9 @@ Status ExecuteTensorListGetItem(xla::XlaOp list, xla::XlaOp index, xla::XlaOp list_part = xla::GetTupleElement(list, 0); xla::XlaOp read = xla::DynamicSlice(list_part, start_indices, slice_shape); - for (int64 i = 0; i < buffer_shape.dimensions_size(); ++i) { + // Propagate dynamic dimensions from buffer to the sliced buffer, except for + // leading dimension (which is always static 1). + for (int64 i = 1; i < buffer_shape.dimensions_size(); ++i) { if (buffer_shape.is_dynamic_dimension(i)) { auto buffer = xla::GetTupleElement(list, 0); auto gds = xla::GetDimensionSize(buffer, i); diff --git a/tensorflow/compiler/tf2xla/kernels/transpose_op.cc b/tensorflow/compiler/tf2xla/kernels/transpose_op.cc index 65569576d41..9a4722d149e 100644 --- a/tensorflow/compiler/tf2xla/kernels/transpose_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/transpose_op.cc @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/primitive_util.h" #include "tensorflow/core/framework/bounds_check.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/register_types.h" @@ -110,11 +111,11 @@ REGISTER_XLA_OP(Name("ConjugateTranspose").CompileTimeConstantInput("perm"), // InvertPermutation frequently forms part of the gradient of Transpose. 
// -// inv = InvertPermutationOp(T p) takes a permutation of +// inv = InvertPermutationOp(p) takes a permutation of // integers 0, 1, ..., n - 1 and returns the inverted // permutation of p. I.e., inv[p[i]] == i, for i in [0 .. n). // -// REQUIRES: input is a vector of int32. +// REQUIRES: input is a vector of int32 or int64. // REQUIRES: input is a permutation of 0, 1, ..., n-1. class InvertPermutationOp : public XlaOpKernel { @@ -122,11 +123,32 @@ class InvertPermutationOp : public XlaOpKernel { explicit InvertPermutationOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} void Compile(XlaOpKernelContext* ctx) override { + DataType dtype = ctx->expected_output_dtype(0); + Status status; + switch (dtype) { + case DT_INT32: + InvertPermutation(ctx); + break; + case DT_INT64: + InvertPermutation(ctx); + break; + default: + // This should never happen since we restrict this kernel to only match + // inputs with supported Tensor datatype. + OP_REQUIRES_OK(ctx, errors::InvalidArgument( + "InvertPermutation expects x as either ", + "int32 or int64, not ", DataTypeString(dtype))); + } + } + + template + void InvertPermutation(XlaOpKernelContext* ctx) { OP_REQUIRES(ctx, FastBoundsCheck(ctx->InputShape(0).num_elements(), - std::numeric_limits::max()), - errors::InvalidArgument("permutation of nonnegative int32s " - "must have <= int32 max elements")); + std::numeric_limits::max()), + errors::InvalidArgument( + "permutation of nonnegative integers must have <= ", + std::numeric_limits::max(), " elements")); auto e = ctx->InputExpression(0); auto tensor_or_status = e.ResolveConstant(ctx->compiler()->client()); @@ -142,7 +164,7 @@ class InvertPermutationOp : public XlaOpKernel { int size = perm.size(); - std::vector output(size); + std::vector output(size); std::fill_n(output.data(), size, -1); for (int i = 0; i < size; ++i) { const int64 d = perm[i]; @@ -153,11 +175,13 @@ class InvertPermutationOp : public XlaOpKernel { output[d] = i; } - ctx->SetOutput(0, xla::ConstantR1(ctx->builder(), output)); + ctx->SetOutput(0, xla::ConstantR1(ctx->builder(), output)); } else { auto indices = ctx->Input(0); - int size = ctx->InputShape(0).num_elements(); - auto iota = xla::Iota(ctx->builder(), xla::S32, size); + T size = ctx->InputShape(0).num_elements(); + auto iota = + xla::Iota(ctx->builder(), + xla::primitive_util::NativeToPrimitiveType(), size); auto result = XlaScatter(iota, iota, indices, /*indices_are_vectors=*/false, /*combiner=*/{}, ctx->builder()); @@ -167,8 +191,9 @@ class InvertPermutationOp : public XlaOpKernel { } }; -REGISTER_XLA_OP(Name("InvertPermutation").TypeConstraint("T", DT_INT32), - InvertPermutationOp); +REGISTER_XLA_OP( + Name("InvertPermutation").TypeConstraint("T", {DT_INT32, DT_INT64}), + InvertPermutationOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc index 6d4393ee006..6fe6b164951 100644 --- a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc @@ -25,6 +25,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/lib/math.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/primitive_util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/kernel_def_builder.h" namespace tensorflow { @@ -76,6 +77,8 @@ XLAJIT_MAKE_UNARY(Log1p, xla::Log1p(x)); XLAJIT_MAKE_UNARY(Invert, xla::Not(x)); XLAJIT_MAKE_UNARY(LogicalNot, xla::Not(x)); +XLAJIT_MAKE_UNARY(PopulationCount, + xla::ConvertElementType(xla::PopulationCount(x), xla::U8)); XLAJIT_MAKE_UNARY(Neg, -x); XLAJIT_MAKE_UNARY(Rint, xla::RoundToEven(x)); diff --git a/tensorflow/compiler/tf2xla/lib/BUILD b/tensorflow/compiler/tf2xla/lib/BUILD index f0bd97c85eb..531679d3905 100644 --- a/tensorflow/compiler/tf2xla/lib/BUILD +++ b/tensorflow/compiler/tf2xla/lib/BUILD @@ -38,6 +38,7 @@ cc_library( hdrs = ["random.h"], deps = [ "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/tf2xla:xla_helpers", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla/client:xla_builder", diff --git a/tensorflow/compiler/tf2xla/lib/data_format.cc b/tensorflow/compiler/tf2xla/lib/data_format.cc index 2ab86c78e44..e5913a8bbf3 100644 --- a/tensorflow/compiler/tf2xla/lib/data_format.cc +++ b/tensorflow/compiler/tf2xla/lib/data_format.cc @@ -66,7 +66,7 @@ xla::StatusOr Expand(xla::XlaOp input, int64 dim) { // Move the newly created dimension to the end with a transpose. std::vector permutation; - for (int64 i = 0, iter_limit = expanded_shape.size(); i != iter_limit; ++i) { + for (int64 i = 0, end = expanded_shape.size(); i != end; ++i) { permutation.push_back(i); if (i == dim) { ++i; diff --git a/tensorflow/compiler/tf2xla/literal_util.cc b/tensorflow/compiler/tf2xla/literal_util.cc index 42a95bbb9f8..74ca16bbaeb 100644 --- a/tensorflow/compiler/tf2xla/literal_util.cc +++ b/tensorflow/compiler/tf2xla/literal_util.cc @@ -72,7 +72,7 @@ Status HostTensorsToBorrowingLiteralTuple(absl::Span host_tensors, buf_ptrs.reserve(host_tensors.size()); std::vector tensor_shapes(host_tensors.size()); - for (int i = 0, iter_limit = host_tensors.size(); i < iter_limit; i++) { + for (int i = 0, end = host_tensors.size(); i < end; i++) { // Validate runtime shapes and fail if it doesn't match the contract. 
const Tensor* tensor = &host_tensors[i]; buf_ptrs.emplace_back(static_cast(DMAHelper::base(tensor))); diff --git a/tensorflow/compiler/tf2xla/ops/xla_ops.cc b/tensorflow/compiler/tf2xla/ops/xla_ops.cc index 862da1f3f95..f4b9e9654d2 100644 --- a/tensorflow/compiler/tf2xla/ops/xla_ops.cc +++ b/tensorflow/compiler/tf2xla/ops/xla_ops.cc @@ -441,7 +441,8 @@ REGISTER_OP("XlaReduce") auto dim_in_range = [rank](int64 dim) { return dim >= 0 && dim < rank; }; - if (rank < dimensions_to_reduce.size() || + const int dimensions_to_reduce_size = dimensions_to_reduce.size(); + if (rank < dimensions_to_reduce_size || dims_set.size() != dimensions_to_reduce.size() || !absl::c_all_of(dimensions_to_reduce, dim_in_range)) { return errors::InvalidArgument( diff --git a/tensorflow/compiler/tf2xla/python/xla.py b/tensorflow/compiler/tf2xla/python/xla.py index 0ebca2d546f..846dafa2570 100644 --- a/tensorflow/compiler/tf2xla/python/xla.py +++ b/tensorflow/compiler/tf2xla/python/xla.py @@ -28,6 +28,7 @@ from __future__ import division from __future__ import print_function from tensorflow.compiler.tf2xla.ops import gen_xla_ops +from tensorflow.core.framework import attr_value_pb2 from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops @@ -415,8 +416,11 @@ sharding = gen_xla_ops.xla_sharding @ops.RegisterGradient("XlaSharding") def _sharding_grad(op, grad): - del op # Unused - return [grad] + grad_sharding = gen_xla_ops.xla_sharding(grad) + # pylint: disable=protected-access + grad_sharding.op._set_attr( + "_XlaSharding", attr_value_pb2.AttrValue(s=op.get_attr("_XlaSharding"))) + return [grad_sharding] spmd_full_to_shard_shape = gen_xla_ops.xla_spmd_full_to_shard_shape diff --git a/tensorflow/compiler/tf2xla/rearrange_function_argument.cc b/tensorflow/compiler/tf2xla/rearrange_function_argument.cc index b6f8928f31e..ed7927a9999 100644 --- a/tensorflow/compiler/tf2xla/rearrange_function_argument.cc +++ b/tensorflow/compiler/tf2xla/rearrange_function_argument.cc @@ -41,7 +41,7 @@ std::vector ShuffleInputDataTypeAttribute( const std::vector& in_types, const std::vector& index_mapping) { std::vector result(index_mapping.size()); - for (int i = 0; i < in_types.size(); i++) { + for (int i = 0, end = in_types.size(); i < end; i++) { result[index_mapping.at(i)] = in_types[i]; } return result; @@ -56,7 +56,7 @@ Status InputTypesNeedsRearrange(const std::vector& in_types, bool* need_rewrite, int* resource_input_count, std::vector* index_mapping) { int first_resource_index = -1; - for (int i = 0; i < in_types.size(); i++) { + for (int i = 0, end = in_types.size(); i < end; i++) { DataType type = in_types[i]; if (type == DT_RESOURCE) { first_resource_index = i; @@ -70,7 +70,7 @@ Status InputTypesNeedsRearrange(const std::vector& in_types, } *need_rewrite = false; - for (int i = first_resource_index + 1; i < in_types.size(); i++) { + for (int i = first_resource_index + 1, end = in_types.size(); i < end; i++) { if (in_types[i] != DT_RESOURCE) { *need_rewrite = true; break; @@ -81,7 +81,7 @@ Status InputTypesNeedsRearrange(const std::vector& in_types, } *resource_input_count = 0; - for (int i = 0; i < in_types.size(); i++) { + for (int i = 0, end = in_types.size(); i < end; i++) { DataType type = in_types[i]; if (type == DT_RESOURCE) { ++(*resource_input_count); @@ -90,7 +90,7 @@ Status InputTypesNeedsRearrange(const std::vector& in_types, int non_resource_index = 0, resource_index = in_types.size() - *resource_input_count; 
index_mapping->resize(in_types.size()); - for (int i = 0; i < in_types.size(); i++) { + for (int i = 0, end = in_types.size(); i < end; i++) { if (in_types[i] != DT_RESOURCE) { (*index_mapping)[i] = non_resource_index; non_resource_index++; @@ -180,7 +180,7 @@ Status CalculateRetvalRearrange( const gtl::InlinedVector& ret_nodes, // non-absl ok std::map* retval_index_mapping, std::map* resource_retval_to_arg) { - for (int i = 0; i < ret_nodes.size(); i++) { + for (int i = 0, end = ret_nodes.size(); i < end; i++) { Node* n = ret_nodes[i]; DataType t; TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "T", &t)); @@ -261,7 +261,7 @@ Status RearrangeOutputEdges(Node* n, Graph* g, void RearrangeRetvalNodes( const gtl::InlinedVector& ret_nodes, // non-absl ok Graph* g, const std::map& retval_index_mapping) { - for (int i = 0; i < ret_nodes.size(); i++) { + for (int i = 0, end = ret_nodes.size(); i < end; i++) { Node* n = ret_nodes[i]; auto iter = retval_index_mapping.find(i); if (iter == retval_index_mapping.end()) { @@ -317,7 +317,7 @@ Status MaybeRewriteWhileNode( // lambda resource_var1, resource_var2: [resource_var2, resource_var1], // [resource_var1, resource_var2]) if (attr_name == "body") { - for (int i = 0; i < fbody->ret_nodes.size(); i++) { + for (int i = 0, end = fbody->ret_nodes.size(); i < end; i++) { Node* n = fbody->ret_nodes[i]; DataType dtype; TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "T", &dtype)); @@ -349,7 +349,7 @@ Status MaybeRewriteWhileNode( RearrangeArgNodes(&fbody->arg_nodes, index_mapping); if (attr_name == "body") { - for (int i = 0; i < fbody->ret_nodes.size(); i++) { + for (int i = 0, end = fbody->ret_nodes.size(); i < end; i++) { Node* n = fbody->ret_nodes[i]; int new_index = index_mapping.at(i); if (new_index < types.size() - resource_input_count) { diff --git a/tensorflow/compiler/tf2xla/sharding_util.cc b/tensorflow/compiler/tf2xla/sharding_util.cc index 366e8d49228..90585c9d98a 100644 --- a/tensorflow/compiler/tf2xla/sharding_util.cc +++ b/tensorflow/compiler/tf2xla/sharding_util.cc @@ -80,6 +80,30 @@ xla::StatusOr> ParseShardingFromDevice( return ParseShardingFromDevice(device_name, num_cores_per_replica, sharding); } +xla::StatusOr> ParseShardingFromEdgeSource( + const Edge& edge, int num_cores_per_replica) { + if (edge.src() == nullptr) { + return tensorflow::errors::InvalidArgument( + "Null src for ParseShardingFromEdgeSource edge=", edge.DebugString()); + } + TF_ASSIGN_OR_RETURN( + absl::optional sharding, + ParseShardingFromDevice(*edge.src(), num_cores_per_replica)); + if (sharding.has_value() && + sharding.value().type() == xla::OpSharding::TUPLE) { + if (edge.src_output() < 0 || + edge.src_output() >= sharding.value().tuple_shardings_size()) { + return tensorflow::errors::InvalidArgument( + "Tuple index out of bound: edge=", edge.DebugString(), + " sharding=", sharding->DebugString()); + } + absl::optional subsharding = + sharding.value().tuple_shardings(edge.src_output()); + return subsharding; + } + return sharding; +} + void SetShardingDeviceAssignmentFromNode(const Node& src, Node* dst) { string device_name = src.assigned_device_name(); if (device_name.empty()) { diff --git a/tensorflow/compiler/tf2xla/sharding_util.h b/tensorflow/compiler/tf2xla/sharding_util.h index 196434826f9..07657c656d3 100644 --- a/tensorflow/compiler/tf2xla/sharding_util.h +++ b/tensorflow/compiler/tf2xla/sharding_util.h @@ -43,6 +43,9 @@ xla::StatusOr> ParseShardingFromDevice( xla::StatusOr> ParseShardingFromDevice( const NodeDef& node_def, int num_cores_per_replica); +xla::StatusOr> 
ParseShardingFromEdgeSource( + const Edge& edge, int num_cores_per_replica); + void SetShardingDeviceAssignmentFromNode(const Node& src, Node* dst); // Get sharding inforamtion from node. diff --git a/tensorflow/compiler/tf2xla/tf2xla.cc b/tensorflow/compiler/tf2xla/tf2xla.cc index 0454bbb771a..242a2b04ab9 100644 --- a/tensorflow/compiler/tf2xla/tf2xla.cc +++ b/tensorflow/compiler/tf2xla/tf2xla.cc @@ -87,7 +87,7 @@ Status ConvertGraphToXla(std::unique_ptr graph, *computation = std::move(*result.computation); int num_const_results = 0; - for (int i = 0, iter_limit = result.outputs.size(); i < iter_limit; ++i) { + for (int i = 0, end = result.outputs.size(); i < end; ++i) { // Ending up with const results (i.e. output args) is an error, since it // means that one or more fetches that the user specified will be dropped // from the generated function. It's most likely a configuration error, diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.cc b/tensorflow/compiler/tf2xla/tf2xla_util.cc index 5229104e674..8863b08b77b 100644 --- a/tensorflow/compiler/tf2xla/tf2xla_util.cc +++ b/tensorflow/compiler/tf2xla/tf2xla_util.cc @@ -143,7 +143,7 @@ Status ReplaceArgUsageWithConstNode( usages.push_back({e->dst()->id(), e->dst_input()}); } - for (int i = 0; i < usages.size(); i++) { + for (int i = 0, end = usages.size(); i < end; i++) { // Make a copy of `usage_node`, and change its input to const node. Node* usage_node = g->FindNodeId(usages[i].dst_node_id); NodeDef replace_def = usage_node->def(); @@ -158,7 +158,7 @@ Status ReplaceArgUsageWithConstNode( // Later entries in `usages` might have `usage_node` as dst node, but // `usage_node` is removed. Replace such entries with `replace_node`. - for (int j = i + 1; j < usages.size(); j++) { + for (int j = i + 1, end = usages.size(); j < end; j++) { if (usages[j].dst_node_id == usages[i].dst_node_id) { usages[j].dst_node_id = replace_node->id(); } diff --git a/tensorflow/compiler/tf2xla/xla_argument.cc b/tensorflow/compiler/tf2xla/xla_argument.cc new file mode 100644 index 00000000000..fe31025386e --- /dev/null +++ b/tensorflow/compiler/tf2xla/xla_argument.cc @@ -0,0 +1,53 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/xla_argument.h" + +namespace tensorflow { + +bool XlaArgument::operator==(const XlaArgument& other) const { + if (std::tie(kind, resource_kind, type, name, initialized, max_array_size, + tensor_array_gradients) != + std::tie(other.kind, other.resource_kind, other.type, other.name, + other.initialized, other.max_array_size, + other.tensor_array_gradients)) { + return false; + } + if (absl::holds_alternative(shape)) { + if (!absl::holds_alternative(other.shape)) { + return false; + } + if (!xla::Shape::Equal()(absl::get(shape), + absl::get(other.shape))) { + return false; + } + } else { + if (!absl::holds_alternative(other.shape)) { + return false; + } + if (absl::get(shape) != absl::get(other.shape)) { + return false; + } + } + if (constant_value.shape() != other.constant_value.shape()) { + return false; + } + if (is_same_data_across_replicas != other.is_same_data_across_replicas) { + return false; + } + return constant_value.tensor_data() == other.constant_value.tensor_data(); +} + +} // end namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/xla_argument.h b/tensorflow/compiler/tf2xla/xla_argument.h new file mode 100644 index 00000000000..e2cd634e1d5 --- /dev/null +++ b/tensorflow/compiler/tf2xla/xla_argument.h @@ -0,0 +1,121 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_XLA_ARGUMENT_H_ +#define TENSORFLOW_COMPILER_TF2XLA_XLA_ARGUMENT_H_ + +#include "absl/types/span.h" +#include "tensorflow/compiler/tf2xla/host_compute_metadata.pb.h" +#include "tensorflow/compiler/tf2xla/xla_resource.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/service/hlo_sharding.h" +#include "tensorflow/core/framework/tensor.h" + +namespace tensorflow { + +// Describes how to derive the value of each _Arg node in the graph/function +// being compiled. There must be one Argument for each _Arg index. +struct XlaArgument { + enum Kind { + // Default value; not a valid kind. + kInvalid, + + // Argument is a compile-time constant. No associated runtime parameter. + kConstant, + + // Argument is a Variable, TensorArray, or Stack resource. Has an + // associated runtime parameter iff `initialized` is true. + kResource, + + // Argument is a run-time parameter. + kParameter, + + // Argument is an XLA token. + kToken, + + // Argument is a TensorList. + kTensorList, + }; + + Kind kind = kInvalid; + + // The type of the argument. If the argument is a resource, this + // is the type of the variable's value, not DT_RESOURCE. + DataType type = DT_INVALID; + + // The shape of the argument. For: + // * a parameter: the shape of the parameter. We allow setting the xla shape + // if known. This helps avoid conversions to and from TensorShape. 
+ // * a constant: ignored; the shape given by constant_value is used + // instead. + // * an uninitialized resource: ignored. We don't yet know the shape of an + // uninitialized resource (otherwise we would have initialized it!) + // * an initialized variable: the shape of the variable's value. + // * an initialized TensorArray or Stack resource: the shape of an entry in + // the TensorArray/Stack. Note this is the size of a single entry, not the + // XLA data structure that represents the complete stack/array. + absl::variant shape; + + // The value of the argument, if it is a compile-time constant. Must be a + // host-memory tensor. + Tensor constant_value; + + // The name of this argument, used for debugging. + string name; + + // The name of TensorFlow _Arg node, used for debugging. + string node_name; + + // For a kResource, what kind of resource is it? + XlaResource::Kind resource_kind = XlaResource::kInvalid; + + // For a kResource, has this resource been initialized? + bool initialized = false; + + // For a kResource, is this resource on Fast Memory. + bool fast_mem = false; + + // For a TensorArray or Stack resource, what is the array's declared size? + // (Used for lazy initialization.) + int64 max_array_size = -1; + + // TensorArray resource parameters are passed as (array, gradient array 0, + // ..., gradient array k), where the gradient arrays are in the same order + // as `tensor_array_gradients`. + std::set tensor_array_gradients; + + // dynamic dims to arg number map. Empty if no dynamic shapes. + std::map dynamic_dim_to_arg_num_map; + bool is_pad_arg = false; + + // Whether this argument will receive the same data across all replicas. + bool is_same_data_across_replicas = false; + + bool operator==(const XlaArgument& other) const; + + // Returns a human-readable summary of the argument. + string HumanString() const; + + // Returns the dimension sizes for either TensorShape or xla::Shape. + std::vector DimensionSizes() const; + absl::InlinedVector DimensionSizesAsInlinedVector() const; + + // Returns the human-readable string for either TensorShape or xla::Shape. + string ShapeHumanString() const; +}; + +} // end namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_XLA_ARGUMENT_H_ diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index 6d92fd97793..635b7170d82 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/jit/shape_inference.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h" #include "tensorflow/compiler/tf2xla/graph_compiler.h" #include "tensorflow/compiler/tf2xla/rearrange_function_argument.h" #include "tensorflow/compiler/tf2xla/shape_util.h" @@ -52,6 +53,7 @@ limitations under the License. 
#include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/protobuf/error_codes.pb.h" +#include "tensorflow/core/protobuf/graph_debug_info.pb.h" #include "tensorflow/core/util/dump_graph.h" namespace tensorflow { @@ -64,7 +66,7 @@ Status CheckSignature(const DataTypeVector& types, return errors::Internal("Compilation arguments have ", args.size(), " elements while function has ", types.size()); } - for (int i = 0, iter_limit = types.size(); i < iter_limit; ++i) { + for (int i = 0, end = types.size(); i < end; ++i) { // Don't perform type checks on resource variables and tensor // lists (DT_VARIANT) as we have to trick the type system in order to // plumb them through. DT_VARIANTS are wrapped in a DT_UINT8 tensor. @@ -168,7 +170,7 @@ Status BuildComputation( int* num_computation_outputs, int* num_nonconst_outputs, std::vector* outputs, std::vector* resource_updates, - xla::Shape* output_shape) { + xla::Shape* output_shape, absl::Span input_mapping) { // Attach a common operator name as metadata. This has no semantic effect — it // merely makes the HLO graph more readable when visualized via TensorBoard, // since TensorBoard forms groups out of operators with similar names. @@ -192,7 +194,7 @@ Status BuildComputation( // replicate sharding is used. The first element is the output index, second // element is the sharding. std::unordered_map retval_index_and_sharding; - for (int i = 0, iter_limit = retvals.size(); i < iter_limit; ++i) { + for (int i = 0, end = retvals.size(); i < end; ++i) { XlaCompiler::OutputDescription& output = (*outputs)[i]; const XlaExpression& retval = retvals[i]; output.type = retval.dtype(); @@ -268,6 +270,11 @@ Status BuildComputation( return a->arg_num() < b->arg_num(); }); + absl::flat_hash_map argument_to_xla_arg; + for (int xla_arg = 0; xla_arg < input_mapping.size(); xla_arg++) { + argument_to_xla_arg[input_mapping[xla_arg]] = xla_arg; + } + std::vector aliases; for (const XlaResource* resource : arg_resources) { DCHECK_LT(resource->arg_num(), args.size()); @@ -290,19 +297,20 @@ Status BuildComputation( update.type = resource->type(); update.shape = resource->shape(); update.modified = modified; + int param_num = use_tuple_arg ? 0 : update.input_index; if (is_entry_computation && arg.resource_kind != XlaResource::kTensorArray && - alias_resource_update) { + alias_resource_update && argument_to_xla_arg.count(param_num)) { // Assuming tuple arg and results are used. xla::ShapeIndex param_index = use_tuple_arg ? xla::ShapeIndex({update.input_index}) : xla::ShapeIndex{}; - int param_number = use_tuple_arg ? 0 : update.input_index; + int xla_param_num = argument_to_xla_arg[param_num]; int64 output_index_num = elems.size(); xla::ShapeIndex output_index = xla::ShapeIndex({output_index_num}); VLOG(3) << "Storing alias: " << output_index.ToString() << ": (" - << param_number << ", " << param_index.ToString() << ")"; - aliases.push_back({output_index, param_number, param_index}); + << xla_param_num << ", " << param_index.ToString() << ")"; + aliases.push_back({output_index, xla_param_num, param_index}); } for (const auto& grad : resource->tensor_array_gradients()) { update.tensor_array_gradients_accessed.insert(grad.first); @@ -356,7 +364,7 @@ Status BuildComputation( xla::Shape shape = xla::ShapeUtil::MakeTupleShape(elem_shapes); // Copy specified sharding from retval_index_and_sharding. 
std::vector sharding_elems; - for (int i = 0, iter_limit = elems.size(); i < iter_limit; i++) { + for (int i = 0, end = elems.size(); i < end; i++) { const auto& iter = retval_index_and_sharding.find(i); TF_RET_CHECK(iter != retval_index_and_sharding.end()); const xla::OpSharding& sub_op_sharding = iter->second; @@ -416,39 +424,6 @@ Status BuildComputation( } // namespace -bool XlaCompiler::Argument::operator==( - const XlaCompiler::Argument& other) const { - if (std::tie(kind, resource_kind, type, name, initialized, max_array_size, - tensor_array_gradients) != - std::tie(other.kind, other.resource_kind, other.type, other.name, - other.initialized, other.max_array_size, - other.tensor_array_gradients)) { - return false; - } - if (absl::holds_alternative(shape)) { - if (!absl::holds_alternative(other.shape)) { - return false; - } - if (!xla::Shape::Equal()(absl::get(shape), - absl::get(other.shape))) { - return false; - } - } else { - if (!absl::holds_alternative(other.shape)) { - return false; - } - if (absl::get(shape) != absl::get(other.shape)) { - return false; - } - } - if (constant_value.shape() != other.constant_value.shape()) { - return false; - } - if (is_same_data_across_replicas != other.is_same_data_across_replicas) { - return false; - } - return constant_value.tensor_data() == other.constant_value.tensor_data(); -} string XlaCompiler::Argument::HumanString() const { string common; @@ -701,7 +676,7 @@ Status XlaCompiler::CompileFunction( // Set shapes for _Arg nodes. They are useful for constant folding (e.g. an // Xla op requires a compile-time constant input, and that input is shape of // an _Arg node. - for (int i = 0, iter_limit = args.size(); i < iter_limit; i++) { + for (int i = 0, end = args.size(); i < end; i++) { // Skip resource variables and tensor lists. DataType dtype; TF_RETURN_IF_ERROR(GetNodeAttr(fbody->arg_nodes[i]->def(), "T", &dtype)); @@ -753,8 +728,18 @@ Status XlaCompiler::CompileFunction( } VLOG(1) << "===================================================="; - TF_RETURN_IF_ERROR( - CompileGraph(options, function_id, std::move(graph), args, result)); + if (GetMlirCommonFlags()->tf_mlir_enable_mlir_bridge) { + VLOG(1) << "Using MLIR bridge"; + GraphDebugInfo debug_info; + TF_RETURN_IF_ERROR(CompileGraphToXlaHlo( + std::move(*graph), {args.data(), args.size()}, + options_.device_type.type_string(), options.use_tuple_arg, + *options_.flib_def, debug_info, options_.shape_representation_fn, + result)); + } else { + TF_RETURN_IF_ERROR( + CompileGraph(options, function_id, std::move(graph), args, result)); + } VLOG(1) << "===================================================="; cache_[{function_id, arg_vector}] = *result; @@ -943,7 +928,7 @@ Status XlaCompiler::BuildArguments( // to the d'th XLA input. Note that the value -1 corresponds to constants, or // other args that don't correspond to an input. std::vector arg_to_inputs(args.size(), -1); - for (int i = 0, iter_limit = input_to_args->size(); i < iter_limit; i++) { + for (int i = 0, end = input_to_args->size(); i < end; i++) { arg_to_inputs[input_to_args->at(i)] = i; } @@ -989,7 +974,7 @@ Status XlaCompiler::BuildArguments( : it->second; } std::vector is_same_across_replicas; - for (int i = 0, iter_limit = input_to_args->size(); i < iter_limit; ++i) { + for (int i = 0, end = input_to_args->size(); i < end; ++i) { // Add an entry to is_same_across_replicas for every leaf buffer. 
is_same_across_replicas.insert( is_same_across_replicas.end(), @@ -1005,7 +990,7 @@ Status XlaCompiler::BuildArguments( tuple = xla::Parameter(builder, 0, (*input_shapes)[0], "arg_tuple"); } - for (int i = 0, iter_limit = input_to_args->size(); i < iter_limit; ++i) { + for (int i = 0, end = input_to_args->size(); i < end; ++i) { const XlaCompiler::Argument& arg = args[input_to_args->at(i)]; for (const auto& dim_and_arg_num : arg.dynamic_dim_to_arg_num_map) { int dynamic_size_param_index = arg_to_inputs.at(dim_and_arg_num.second); @@ -1024,6 +1009,11 @@ Status XlaCompiler::BuildArguments( xla::XlaScopedShardingAssignment assign_sharding( builder, it == arg_shardings.end() ? absl::optional() : it->second); + auto& arg = args[input_to_args->at(i)]; + + xla::OpMetadata arg_metadata; + arg_metadata.set_op_name(arg.node_name); + builder->SetOneShotOpMetadata(arg_metadata); arg_handles[i] = xla::GetTupleElement(tuple, i); } } else { @@ -1046,7 +1036,7 @@ Status XlaCompiler::BuildArguments( } } - for (int i = 0, iter_limit = input_to_args->size(); i < iter_limit; ++i) { + for (int i = 0, end = input_to_args->size(); i < end; ++i) { const XlaCompiler::Argument& arg = args[input_to_args->at(i)]; for (const auto& dim_and_arg_num : arg.dynamic_dim_to_arg_num_map) { int dynamic_size_param_index = arg_to_inputs.at(dim_and_arg_num.second); @@ -1315,7 +1305,8 @@ Status XlaCompiler::CompileGraph( options.always_return_tuple, options.use_tuple_arg, options.alias_resource_update, &builder, result->computation.get(), &num_computation_outputs, &num_nonconst_outputs, &result->outputs, - &result->resource_updates, &result->xla_output_shape)); + &result->resource_updates, &result->xla_output_shape, + result->input_mapping)); VLOG(2) << "Outputs: total: " << context->retvals().size() << " nonconstant: " << num_nonconst_outputs; @@ -1366,7 +1357,7 @@ void SetTransfer(const string& key, absl::Span types, tf2xla::HostTransferMetadata* transfer) { transfer->set_key(key); CHECK(types.size() == shapes.size()); - for (int i = 0, iter_limit = types.size(); i < iter_limit; ++i) { + for (int i = 0, end = types.size(); i < end; ++i) { tf2xla::TensorMetadata* metadata = transfer->add_metadata(); metadata->set_type(types[i]); shapes[i].AsProto(metadata->mutable_shape()); @@ -1482,93 +1473,4 @@ xla::StatusOr XlaCompiler::GetNodeToken(const string& node_name) { return iter->second; } -XlaCompiler::ShapeRepresentationFn IdentityShapeRepresentationFn() { - return [](const TensorShape& shape, DataType dtype, - bool use_fast_memory) -> xla::StatusOr { - xla::Shape xla_shape; - TF_RETURN_IF_ERROR(TensorShapeToXLAShape(dtype, shape, &xla_shape)); - return xla_shape; - }; -} - -// Rewrites the layout of xla_shape if there is tiled sharding. -Status RewriteLayoutWithShardedShape( - const absl::optional& sharding, bool use_fast_memory, - XlaCompiler::ShapeRepresentationFn shape_representation_fn, - xla::Shape* xla_shape) { - if (sharding && !sharding->IsTileMaximal()) { - // After sharding, per core shape might have different layout. For example, - // before sharding, a shape [128, 128] will be assigned default - // minor-to-major {1, 0}. But after we shard this shape to [128, 64] * 2, - // the sharded shapes will have minor-to-major {0, 1}. - // - // As a result, for sharded shapes, we set their layout to per core shape's - // layout. - // - // TODO(endlessroad): for variable input & update, we might have - // different layouts which will prevent input output aliasing and - // increase memory usage. Investigate such cases. 
- int64 device = *sharding->tile_assignment().begin(); - std::vector offset = - sharding->TileOffsetForDevice(*xla_shape, device); - std::vector limit = sharding->TileLimitForDevice(*xla_shape, device); - std::vector dimensions(xla_shape->rank()); - for (int64 i = 0; i < xla_shape->rank(); ++i) { - dimensions[i] = limit[i] - offset[i]; - } - xla::Shape per_device_xla_shape = - xla::ShapeUtil::MakeShape(xla_shape->element_type(), dimensions); - TensorShape per_device_tensor_shape; - TF_RETURN_IF_ERROR( - XLAShapeToTensorShape(per_device_xla_shape, &per_device_tensor_shape)); - TF_ASSIGN_OR_RETURN(DataType dtype, EncodePrimitiveTypeAsDataType( - xla_shape->element_type())); - TF_ASSIGN_OR_RETURN(per_device_xla_shape, - shape_representation_fn(per_device_tensor_shape, dtype, - use_fast_memory)); - *xla_shape->mutable_layout() = per_device_xla_shape.layout(); - } - return Status::OK(); -} - -// There is a shape_representation_fn or sharding for an output, this function -// uses a reshape to fix the layout. -xla::StatusOr ReshapeWithCorrectRepresentationAndSharding( - xla::XlaBuilder* builder, xla::XlaOp original, xla::Shape original_shape, - XlaCompiler::ShapeRepresentationFn shape_representation_fn, - absl::optional sharding, bool fast_mem) { - if (original_shape.IsTuple()) { - std::vector elements; - for (int64 i = 0; i < original_shape.tuple_shapes_size(); ++i) { - auto subsharding = sharding ? sharding->tuple_shardings(i) : sharding; - TF_ASSIGN_OR_RETURN(auto element, - ReshapeWithCorrectRepresentationAndSharding( - builder, xla::GetTupleElement(original, i), - original_shape.tuple_shapes(i), - shape_representation_fn, subsharding, fast_mem)); - elements.push_back(element); - } - return xla::Tuple(builder, elements); - } - if (!original_shape.IsArray()) return original; - TensorShape shape; - TF_RETURN_IF_ERROR(XLAShapeToTensorShape(original_shape, &shape)); - TF_ASSIGN_OR_RETURN(DataType dtype, EncodePrimitiveTypeAsDataType( - original_shape.element_type())); - TF_ASSIGN_OR_RETURN(auto to_shape, - shape_representation_fn(shape, dtype, fast_mem)); - if (sharding) { - TF_ASSIGN_OR_RETURN(auto hlo_sharding, - xla::HloSharding::FromProto(*sharding)); - TF_RETURN_IF_ERROR(RewriteLayoutWithShardedShape( - hlo_sharding, fast_mem, shape_representation_fn, &to_shape)); - } - if (xla::ShapeUtil::Compatible(original_shape, to_shape)) { - for (int64 i = 0; i < original_shape.rank(); ++i) { - to_shape.set_dynamic_dimension(i, original_shape.is_dynamic_dimension(i)); - } - } - return xla::Reshape(to_shape, original); -} - } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h index b95d250636a..b0d93cde846 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.h +++ b/tensorflow/compiler/tf2xla/xla_compiler.h @@ -21,8 +21,10 @@ limitations under the License. #include "absl/types/span.h" #include "absl/types/variant.h" #include "tensorflow/compiler/tf2xla/host_compute_metadata.pb.h" +#include "tensorflow/compiler/tf2xla/xla_argument.h" #include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_expression.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/xla_builder.h" @@ -97,96 +99,7 @@ class XlaContext; // `tensor_array_gradients` ordered set. 
class XlaCompiler { public: - // Describes how to derive the value of each _Arg node in the graph/function - // being compiled. There must be one Argument for each _Arg index. - struct Argument { - enum Kind { - // Default value; not a valid kind. - kInvalid, - - // Argument is a compile-time constant. No associated runtime parameter. - kConstant, - - // Argument is a Variable, TensorArray, or Stack resource. Has an - // associated runtime parameter iff `initialized` is true. - kResource, - - // Argument is a run-time parameter. - kParameter, - - // Argument is an XLA token. - kToken, - - // Argument is a TensorList. - kTensorList, - }; - - Kind kind = kInvalid; - - // The type of the argument. If the argument is a resource, this - // is the type of the variable's value, not DT_RESOURCE. - DataType type = DT_INVALID; - - // The shape of the argument. For: - // * a parameter: the shape of the parameter. We allow setting the xla shape - // if known. This helps avoid conversions to and from TensorShape. - // * a constant: ignored; the shape given by constant_value is used - // instead. - // * an uninitialized resource: ignored. We don't yet know the shape of an - // uninitialized resource (otherwise we would have initialized it!) - // * an initialized variable: the shape of the variable's value. - // * an initialized TensorArray or Stack resource: the shape of an entry in - // the TensorArray/Stack. Note this is the size of a single entry, not the - // XLA data structure that represents the complete stack/array. - absl::variant shape; - - // The value of the argument, if it is a compile-time constant. Must be a - // host-memory tensor. - Tensor constant_value; - - // The name of this argument, used for debugging. - string name; - - // The name of TensorFlow _Arg node, used for debugging. - string node_name; - - // For a kResource, what kind of resource is it? - XlaResource::Kind resource_kind = XlaResource::kInvalid; - - // For a kResource, has this resource been initialized? - bool initialized = false; - - // For a kResource, is this resource on Fast Memory. - bool fast_mem = false; - - // For a TensorArray or Stack resource, what is the array's declared size? - // (Used for lazy initialization.) - int64 max_array_size = -1; - - // TensorArray resource parameters are passed as (array, gradient array 0, - // ..., gradient array k), where the gradient arrays are in the same order - // as `tensor_array_gradients`. - std::set tensor_array_gradients; - - // dynamic dims to arg number map. Empty if no dynamic shapes. - std::map dynamic_dim_to_arg_num_map; - bool is_pad_arg = false; - - // Whether this argument will receive the same data across all replicas. - bool is_same_data_across_replicas = false; - - bool operator==(const Argument& other) const; - - // Returns a human-readable summary of the argument. - string HumanString() const; - - // Returns the dimension sizes for either TensorShape or xla::Shape. - std::vector DimensionSizes() const; - absl::InlinedVector DimensionSizesAsInlinedVector() const; - - // Returns the human-readable string for either TensorShape or xla::Shape. - string ShapeHumanString() const; - }; + using Argument = ::tensorflow::XlaArgument; // Options pertaining to an individual call to CompileGraph() or // CompileFunction(). @@ -221,77 +134,11 @@ class XlaCompiler { bool alias_resource_update = false; }; - struct OutputDescription { - // Type and shape of the output. The shape is the unflattened shape. 
- // When `type` is DT_RESOURCE, `shape` is the shape of the resource - // variable's value. - DataType type; - TensorShape shape; + using OutputDescription = ::tensorflow::XlaOutputDescription; - // Constant output value, if known to be constant at JIT compilation time. - // 'Tensor' is in host memory. - bool is_constant = false; - Tensor constant_value; + using ResourceUpdate = ::tensorflow::XlaResourceUpdate; - // When this output is a resource, i.e. `type == DT_RESOURCE`, this is - // the index of the input that contains the resource. - int input_index; - - // Whether this output is a TensorList. - bool is_tensor_list = false; - }; - - // Describes a variable write side effect of the computation. - struct ResourceUpdate { - // Index of the input that contains the variable resource to write to. - int input_index; - - // Type and shape of the tensor to be written back. - // The `shape` field has the same meaning as the Argument::shape field. - DataType type; - TensorShape shape; - - // Was the value of the variable modified by the computation? - // (Always true, unless `return_updated_values_for_all_resources` is true.) - bool modified; - - // If the resource is a TensorArray, the set of gradients read or written. - std::set tensor_array_gradients_accessed; - }; - - struct CompilationResult { - // Vector that maps from the parameters of the XLA computation to their - // original argument positions. To handle compile-time constant inputs, the - // parameters to the XLA computation may be a subset of the original - // arguments. The relative ordering of parameters are maintained. - std::vector input_mapping; - - // Input shapes of the computation. If we are flattening inputs, these are - // the flattened shapes. - std::vector xla_input_shapes; - - // Output shape in XLA format. The output shape is always a tuple. If we - // are flattening outputs, these are the flattened shapes. - xla::Shape xla_output_shape; - - // TensorFlow shapes of outputs, together with the values of any - // constant arguments. Vector indexed by Tensorflow _Retval number, - // containing both constant and non-constant results. - std::vector outputs; - - // TensorFlow shapes and types of sends/recvs from HostCompute Ops to their - // matching RecvAtHost/SendFromHost Ops in the outer graph. - tf2xla::HostComputeMetadata host_compute_metadata; - - // Resources whose values were updated by the computation, ordered - // by return value position (which is the same as the order the resources - // were passed as arguments). Resource updates follow the non-constant - // results in the outputs of XLA computation. - std::vector resource_updates; - - // The XLA computation built from the tensorflow subgraph. - std::shared_ptr computation; - }; + using CompilationResult = ::tensorflow::XlaCompilationResult; typedef std::function(const TensorShape&, DataType, bool)> @@ -518,21 +365,6 @@ class XlaCompiler { TF_DISALLOW_COPY_AND_ASSIGN(XlaCompiler); }; -// Creates an identity shape representation function. -XlaCompiler::ShapeRepresentationFn IdentityShapeRepresentationFn(); - -// Rewrites the layout of xla_shape if there is tiled sharding. -Status RewriteLayoutWithShardedShape( - const absl::optional& sharding, bool use_fast_memory, - XlaCompiler::ShapeRepresentationFn shape_representation_fn, - xla::Shape* xla_shape); - -// Adds reshapes to fix the layout of an output, if a shape_representation_fn or -// sharding is present. 
-xla::StatusOr ReshapeWithCorrectRepresentationAndSharding( - xla::XlaBuilder* builder, xla::XlaOp original, xla::Shape original_shape, - XlaCompiler::ShapeRepresentationFn shape_representation_fn, - absl::optional sharding, bool fast_mem); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc index 4f1b6c8e7a9..5df508d60b3 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc @@ -1856,5 +1856,46 @@ TEST_F(XlaCompilerTest, DoNotConstantFoldShapeOp) { EXPECT_TRUE(xla::LiteralTestUtil::Equal(expected_literal, actual_literal)); } +TEST_F(XlaCompilerTest, AliasResourceUpdates) { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto a = ops::Const(scope.WithOpName("A"), {1, 2}); + auto var = ops::_Arg(scope.WithOpName("V"), DT_RESOURCE, 1); + auto write = ops::AssignAddVariableOp(scope, var, a); + auto read = ops::ReadVariableOp( + scope.WithControlDependencies(std::vector{write}), var, + DT_INT32); + auto d = ops::_Retval(scope.WithOpName("D"), read, 0); + std::unique_ptr graph(new Graph(OpRegistry::Global())); + TF_ASSERT_OK(scope.ToGraph(graph.get())); + + // Builds a description of the arguments. + std::vector args(2); + args[0].kind = XlaCompiler::Argument::kConstant; + args[0].type = DT_INT32; + args[0].shape = TensorShape({2}); + args[0].constant_value = Tensor(DT_INT32, {1, 1}); + args[0].initialized = true; + + args[1].kind = XlaCompiler::Argument::kResource; + args[1].resource_kind = XlaResource::kVariable; + args[1].initialized = true; + args[1].type = DT_INT32; + args[1].shape = TensorShape({2}); + + XlaCompiler compiler(DefaultOptions()); + + XlaCompiler::CompileOptions compile_options; + compile_options.alias_resource_update = true; + + XlaCompiler::CompilationResult result; + TF_ASSERT_OK(compiler.CompileGraph(compile_options, "add", std::move(graph), + args, &result)); + + const xla::HloInputOutputAliasProto& alias = + result.computation->proto().input_output_alias(); + EXPECT_EQ(alias.entries_size(), 1); + EXPECT_EQ(alias.entries(0).parameter_number(), 0); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/xla_context.cc b/tensorflow/compiler/tf2xla/xla_context.cc index c94c4805d53..cb5bf34208f 100644 --- a/tensorflow/compiler/tf2xla/xla_context.cc +++ b/tensorflow/compiler/tf2xla/xla_context.cc @@ -24,7 +24,6 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" -#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/client/xla_computation.h" diff --git a/tensorflow/compiler/tf2xla/xla_context.h b/tensorflow/compiler/tf2xla/xla_context.h index eb4ad3fe6a1..e44ac05b702 100644 --- a/tensorflow/compiler/tf2xla/xla_context.h +++ b/tensorflow/compiler/tf2xla/xla_context.h @@ -20,7 +20,6 @@ limitations under the License. #include -#include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/tf2xla/xla_expression.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/client/xla_computation.h" @@ -33,6 +32,7 @@ limitations under the License. 
namespace tensorflow { class XlaOpKernelContext; +class XlaCompiler; // The XlaContext is the data structure that holds the state of an XLA // compilation, that is accessible from OpKernelContexts when compiling a diff --git a/tensorflow/compiler/tf2xla/xla_expression.cc b/tensorflow/compiler/tf2xla/xla_expression.cc index 49f108ed6c8..f0cc8d26709 100644 --- a/tensorflow/compiler/tf2xla/xla_expression.cc +++ b/tensorflow/compiler/tf2xla/xla_expression.cc @@ -101,6 +101,48 @@ xla::XlaOp XlaExpression::AsXlaOp(xla::XlaBuilder* builder) const { }); } +xla::StatusOr XlaExpression::ResolveDynamism( + xla::Client* client) const { + switch (kind()) { + case Kind::kConstant: { + // Constant values are considered static. + Tensor constant_false(DT_BOOL, constant_value().shape()); + auto flat = constant_false.flat(); + for (int64 i = 0; i < flat.size(); ++i) flat(i) = false; + return constant_false; + } + case Kind::kXlaOp: + break; + case Kind::kTensorList: + TF_FALLTHROUGH_INTENDED; + case Kind::kResource: + TF_FALLTHROUGH_INTENDED; + case Kind::kInvalid: + return errors::InvalidArgument( + "ResolveDynamism called on unsupported XlaExpression: ", + HumanString()); + } + + if (!client) + return errors::InvalidArgument("client is required to resolve constant"); + + TF_ASSIGN_OR_RETURN(xla::XlaComputation constant_graph, + handle().builder()->BuildDynamicInferenceGraph(handle())); + + TF_ASSIGN_OR_RETURN(TensorShape shape, GetShape()); + + // The XLA layout is specified minor to major, and TensorFlow uses a major to + // minor order. + std::vector layout_indices(shape.dims()); + std::iota(layout_indices.rbegin(), layout_indices.rend(), 0); + xla::Layout layout = xla::LayoutUtil::MakeLayout(layout_indices); + TF_ASSIGN_OR_RETURN(xla::Literal literal, + client->ComputeConstant(constant_graph, &layout)); + Tensor tensor(DT_BOOL); + TF_RETURN_IF_ERROR(LiteralToHostTensor(literal, DT_BOOL, &tensor)); + return tensor; +} + xla::StatusOr> XlaExpression::ResolveConstant( xla::Client* client, bool dynamic_dimension_is_minus_one) const { switch (kind()) { @@ -163,4 +205,23 @@ xla::StatusOr XlaExpression::GetShape() const { } } +const XlaExpression* XlaExpression::CastExpressionFromTensor( + const Tensor& tensor) { + const XlaExpression* expression = + reinterpret_cast(tensor.tensor_data().data()); + CHECK(expression->kind() != XlaExpression::Kind::kInvalid) + << expression->HumanString(); + return expression; +} + +// Assigns an XlaExpression to a tensor on an XLA compilation device. +void XlaExpression::AssignExpressionToTensor(const XlaExpression& value, + Tensor* tensor) { + const XlaExpression* expression = + reinterpret_cast(tensor->tensor_data().data()); + CHECK(expression->kind() == XlaExpression::Kind::kInvalid) + << expression->HumanString(); + *const_cast(expression) = value; +} + } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/xla_expression.h b/tensorflow/compiler/tf2xla/xla_expression.h index 5d0bb35b182..3546368ff7b 100644 --- a/tensorflow/compiler/tf2xla/xla_expression.h +++ b/tensorflow/compiler/tf2xla/xla_expression.h @@ -99,11 +99,22 @@ class XlaExpression { xla::StatusOr> ResolveConstant( xla::Client* client, bool dynamic_dimension_is_minus_one = false) const; + // ResolveDynamism computes where a value inside this op is dynamic or can be + // inferred at compile time. + xla::StatusOr ResolveDynamism(xla::Client* client) const; + // Returns the shape of the tensor. 
// The shape of a resource is the shape of a resource handle (i.e., a scalar), // not the shape of the resource's value. xla::StatusOr GetShape() const; + // Retrieves an XlaExpression that was allocated by a previous Op. + static const XlaExpression* CastExpressionFromTensor(const Tensor& tensor); + + // Assigns an XlaExpression to a tensor on an XLA compilation device. + static void AssignExpressionToTensor(const XlaExpression& value, + Tensor* tensor); + private: Kind kind_ = Kind::kInvalid; diff --git a/tensorflow/compiler/tf2xla/xla_helpers.cc b/tensorflow/compiler/tf2xla/xla_helpers.cc index 74247bbaec7..8c4b55aec8a 100644 --- a/tensorflow/compiler/tf2xla/xla_helpers.cc +++ b/tensorflow/compiler/tf2xla/xla_helpers.cc @@ -22,8 +22,6 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/literal_util.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" -#include "tensorflow/compiler/tf2xla/xla_context.h" -#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/lib/constants.h" #include "tensorflow/compiler/xla/client/xla_builder.h" @@ -128,4 +126,93 @@ xla::XlaOp XlaHelpers::ConvertElementType(const xla::XlaOp& operand, return xla::ConvertElementType(operand, convert_to); } +XlaHelpers::ShapeRepresentationFn IdentityShapeRepresentationFn() { + return [](const TensorShape& shape, DataType dtype, + bool use_fast_memory) -> xla::StatusOr { + xla::Shape xla_shape; + TF_RETURN_IF_ERROR(TensorShapeToXLAShape(dtype, shape, &xla_shape)); + return xla_shape; + }; +} + +// Rewrites the layout of xla_shape if there is tiled sharding. +Status RewriteLayoutWithShardedShape( + const absl::optional& sharding, bool use_fast_memory, + XlaHelpers::ShapeRepresentationFn shape_representation_fn, + xla::Shape* xla_shape) { + if (sharding && !sharding->IsTileMaximal()) { + // After sharding, per core shape might have different layout. For example, + // before sharding, a shape [128, 128] will be assigned default + // minor-to-major {1, 0}. But after we shard this shape to [128, 64] * 2, + // the sharded shapes will have minor-to-major {0, 1}. + // + // As a result, for sharded shapes, we set their layout to per core shape's + // layout. + // + // TODO(endlessroad): for variable input & update, we might have + // different layouts which will prevent input output aliasing and + // increase memory usage. Investigate such cases. + int64 device = *sharding->tile_assignment().begin(); + std::vector offset = + sharding->TileOffsetForDevice(*xla_shape, device); + std::vector limit = sharding->TileLimitForDevice(*xla_shape, device); + std::vector dimensions(xla_shape->rank()); + for (int64 i = 0; i < xla_shape->rank(); ++i) { + dimensions[i] = limit[i] - offset[i]; + } + xla::Shape per_device_xla_shape = + xla::ShapeUtil::MakeShape(xla_shape->element_type(), dimensions); + TensorShape per_device_tensor_shape; + TF_RETURN_IF_ERROR( + XLAShapeToTensorShape(per_device_xla_shape, &per_device_tensor_shape)); + TF_ASSIGN_OR_RETURN(DataType dtype, EncodePrimitiveTypeAsDataType( + xla_shape->element_type())); + TF_ASSIGN_OR_RETURN(per_device_xla_shape, + shape_representation_fn(per_device_tensor_shape, dtype, + use_fast_memory)); + *xla_shape->mutable_layout() = per_device_xla_shape.layout(); + } + return Status::OK(); +} + +// There is a shape_representation_fn or sharding for an output, this function +// uses a reshape to fix the layout. 
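The shape_representation_fn hook that these helpers thread through is what decides the concrete XLA layout for each TensorFlow tensor. As an illustration only (the function name below is hypothetical and not part of this change), a caller could supply a callback that pins every tensor to a descending minor-to-major layout instead of using the IdentityShapeRepresentationFn added above:

#include <numeric>
#include <vector>

#include "tensorflow/compiler/tf2xla/type_util.h"
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/core/platform/errors.h"

namespace tensorflow {

// Illustrative sketch: always lay tensors out with minor-to-major
// {rank-1, ..., 1, 0}, ignoring the fast-memory hint.
XlaHelpers::ShapeRepresentationFn DescendingLayoutShapeRepresentationFn() {
  return [](const TensorShape& shape, DataType dtype,
            bool use_fast_memory) -> xla::StatusOr<xla::Shape> {
    xla::PrimitiveType type;
    TF_RETURN_IF_ERROR(DataTypeToPrimitiveType(dtype, &type));
    std::vector<int64> dims(shape.dims());
    for (int i = 0; i < shape.dims(); ++i) dims[i] = shape.dim_size(i);
    std::vector<int64> minor_to_major(shape.dims());
    std::iota(minor_to_major.rbegin(), minor_to_major.rend(), 0);
    return xla::ShapeUtil::MakeShapeWithLayout(type, dims, minor_to_major);
  };
}

}  // namespace tensorflow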
+xla::StatusOr ReshapeWithCorrectRepresentationAndSharding( + xla::XlaBuilder* builder, xla::XlaOp original, xla::Shape original_shape, + XlaHelpers::ShapeRepresentationFn shape_representation_fn, + absl::optional sharding, bool fast_mem) { + if (original_shape.IsTuple()) { + std::vector elements; + for (int64 i = 0; i < original_shape.tuple_shapes_size(); ++i) { + auto subsharding = sharding ? sharding->tuple_shardings(i) : sharding; + TF_ASSIGN_OR_RETURN(auto element, + ReshapeWithCorrectRepresentationAndSharding( + builder, xla::GetTupleElement(original, i), + original_shape.tuple_shapes(i), + shape_representation_fn, subsharding, fast_mem)); + elements.push_back(element); + } + return xla::Tuple(builder, elements); + } + if (!original_shape.IsArray()) return original; + TensorShape shape; + TF_RETURN_IF_ERROR(XLAShapeToTensorShape(original_shape, &shape)); + TF_ASSIGN_OR_RETURN(DataType dtype, EncodePrimitiveTypeAsDataType( + original_shape.element_type())); + TF_ASSIGN_OR_RETURN(auto to_shape, + shape_representation_fn(shape, dtype, fast_mem)); + if (sharding) { + TF_ASSIGN_OR_RETURN(auto hlo_sharding, + xla::HloSharding::FromProto(*sharding)); + TF_RETURN_IF_ERROR(RewriteLayoutWithShardedShape( + hlo_sharding, fast_mem, shape_representation_fn, &to_shape)); + } + if (xla::ShapeUtil::Compatible(original_shape, to_shape)) { + for (int64 i = 0; i < original_shape.rank(); ++i) { + to_shape.set_dynamic_dimension(i, original_shape.is_dynamic_dimension(i)); + } + } + return xla::Reshape(to_shape, original); +} + } // end namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/xla_helpers.h b/tensorflow/compiler/tf2xla/xla_helpers.h index 490923526bd..3a9375ec1f4 100644 --- a/tensorflow/compiler/tf2xla/xla_helpers.h +++ b/tensorflow/compiler/tf2xla/xla_helpers.h @@ -19,8 +19,9 @@ limitations under the License. #define TENSORFLOW_COMPILER_TF2XLA_XLA_HELPERS_H_ #include "absl/types/span.h" -#include "tensorflow/compiler/tf2xla/xla_context.h" +#include "tensorflow/compiler/tf2xla/host_compute_metadata.pb.h" #include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/service/hlo_sharding.h" #include "tensorflow/core/framework/tensor.h" namespace tensorflow { @@ -72,6 +73,98 @@ class XlaHelpers { // than the xla::PrimitiveType. static xla::XlaOp ConvertElementType(const xla::XlaOp& operand, const DataType new_element_type); + + typedef std::function(const TensorShape&, DataType, + bool)> + ShapeRepresentationFn; +}; + +// Creates an identity shape representation function. +XlaHelpers::ShapeRepresentationFn IdentityShapeRepresentationFn(); + +// Rewrites the layout of xla_shape if there is tiled sharding. +Status RewriteLayoutWithShardedShape( + const absl::optional& sharding, bool use_fast_memory, + XlaHelpers::ShapeRepresentationFn shape_representation_fn, + xla::Shape* xla_shape); + +// Adds reshapes to fix the layout of an output, if a shape_representation_fn or +// sharding is present. +xla::StatusOr ReshapeWithCorrectRepresentationAndSharding( + xla::XlaBuilder* builder, xla::XlaOp original, xla::Shape original_shape, + XlaHelpers::ShapeRepresentationFn shape_representation_fn, + absl::optional sharding, bool fast_mem); + +struct XlaOutputDescription { + // Type and shape of the output. The shape is the unflattened shape. + // When `type` is DT_RESOURCE, `shape` is the shape of the resource + // variable's value. + DataType type; + TensorShape shape; + + // Constant output value, if known to be constant at JIT compilation time. 
+ // 'Tensor' is in host memory. + bool is_constant = false; + Tensor constant_value; + + // When this output is a resource, i.e. `type == DT_RESOURCE`, this is + // the index of the input that contains the resource. + int input_index; + + // Whether this output is a TensorList. + bool is_tensor_list = false; +}; + +// Describes a variable write side effect of the computation. +struct XlaResourceUpdate { + // Index of the input that contains the variable resource to write to. + int input_index; + + // Type and shape of the tensor to be written back. + // The `shape` field has the same meaning as the Argument::shape field. + DataType type; + TensorShape shape; + + // Was the value of the variable modified by the computation? + // (Always true, unless `return_updated_values_for_all_resources` is true.) + bool modified; + + // If the resource is a TensorArray, the set of gradients read or written. + std::set tensor_array_gradients_accessed; +}; + +struct XlaCompilationResult { + // Vector that maps from the parameters of the XLA computation to their + // original argument positions. To handle compile-time constant inputs, the + // parameters to the XLA computation may be a subset of the original + // arguments. The relative ordering of parameters are maintained. + std::vector input_mapping; + + // Input shapes of the computation. If we are flattening inputs, these are + // the flattened shapes. + std::vector xla_input_shapes; + + // Output shape in XLA format. The output shape is always a tuple. If we + // are flattening outputs, these are the flattened shapes. + xla::Shape xla_output_shape; + + // TensorFlow shapes of outputs, together with the values of any + // constant arguments. Vector indexed by Tensorflow _Retval number, + // containing both constant and non-constant results. + std::vector outputs; + + // TensorFlow shapes and types of sends/recvs from HostCompute Ops to their + // matching RecvAtHost/SendFromHost Ops in the outer graph. + tf2xla::HostComputeMetadata host_compute_metadata; + + // Resources whose values were updated by the computation, ordered + // by return value position (which is the same as the order the resources + // were passed as arguments). Resource updates follow the non-constant + // results in the outputs of XLA computation. + std::vector resource_updates; + + // The XLA computation built from the tensorflow subgraph. + std::shared_ptr computation; }; } // end namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc index 27766408716..07537546d52 100644 --- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc +++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc @@ -49,33 +49,13 @@ XlaCompiler* XlaOpKernelContext::compiler() const { return xla_context()->compiler(); } -// Retrieves an XlaExpression that was allocated by a previous Op. -const XlaExpression* XlaOpKernelContext::CastExpressionFromTensor( - const Tensor& tensor) { - const XlaExpression* expression = - reinterpret_cast(tensor.tensor_data().data()); - CHECK(expression->kind() != XlaExpression::Kind::kInvalid) - << expression->HumanString(); - return expression; -} - -// Assigns an XlaExpression to a tensor on an XLA compilation device. 
-void XlaOpKernelContext::AssignExpressionToTensor(const XlaExpression& value, - Tensor* tensor) { - const XlaExpression* expression = - reinterpret_cast(tensor->tensor_data().data()); - CHECK(expression->kind() == XlaExpression::Kind::kInvalid) - << expression->HumanString(); - *const_cast(expression) = value; -} - const XlaExpression& XlaOpKernelContext::InputExpression(int index) { - return *CastExpressionFromTensor(context_->input(index)); + return *XlaExpression::CastExpressionFromTensor(context_->input(index)); } const XlaExpression& XlaOpKernelContext::InputExpression( absl::string_view name) { - return *CastExpressionFromTensor(GetInputTensorByName(name)); + return *XlaExpression::CastExpressionFromTensor(GetInputTensorByName(name)); } xla::XlaOp XlaOpKernelContext::Input(int index) { @@ -108,7 +88,8 @@ DataType XlaOpKernelContext::input_type(int index) const { if (type == DT_UINT8) { // Masqueraded XlaExpression could have different type. See // XlaOpKernelContext::SetOutputExpression for details. - auto expression = CastExpressionFromTensor(context_->input(index)); + auto expression = + XlaExpression::CastExpressionFromTensor(context_->input(index)); type = expression->dtype(); } return type; @@ -120,7 +101,7 @@ DataType XlaOpKernelContext::InputType(absl::string_view name) { if (type == DT_UINT8) { // Masqueraded XlaExpression could have different type. See // XlaOpKernelContext::SetOutputExpression for details. - auto expression = CastExpressionFromTensor(tensor); + auto expression = XlaExpression::CastExpressionFromTensor(tensor); type = expression->dtype(); } return type; @@ -262,6 +243,48 @@ Status XlaOpKernelContext::ConstantInputAsFloatScalar(int index, double* out) { return LiteralToFloat64Scalar(literal, out); } +static Status LiteralToPredVector(const xla::LiteralSlice& literal, + std::vector* out) { + if (literal.shape().rank() != 1) { + return errors::InvalidArgument("value is not 1D, rank: ", + literal.shape().rank()); + } + int64 size = xla::ShapeUtil::ElementsIn(literal.shape()); + if (literal.shape().element_type() != xla::PRED) { + return errors::InvalidArgument("value is not PRED"); + } + for (int64 i = 0; i < size; ++i) { + out->push_back(literal.Get({i})); + } + return Status::OK(); +} + +Status XlaOpKernelContext::ResolveInputDynamismIntoPredVector( + int index, std::vector* out) { + xla::Literal literal; + XlaExpression e = InputExpression(index); + auto* client = compiler() ? compiler()->client() : nullptr; + xla::StatusOr dynamism_or_status = e.ResolveDynamism(client); + if (!dynamism_or_status.ok()) { + Status status = dynamism_or_status.status(); + errors::AppendToMessage(&status, "while evaluating input dynamism", index, + " of ", context_->op_kernel().type_string()); + return status; + } + Tensor dynamism = dynamism_or_status.ValueOrDie(); + + Tensor temp(dynamism.dtype()); + TensorShape tensor_shape({InputShape(index).num_elements()}); + if (!temp.CopyFrom(dynamism, tensor_shape)) { + return errors::InvalidArgument( + context_->op_kernel().name(), " input ", index, " has shape ", + dynamism.shape().DebugString(), " which is not a R1 ", tensor_shape); + } + + TF_ASSIGN_OR_RETURN(literal, HostTensorToLiteral(temp)); + return LiteralToPredVector(literal, out); +} + // Converts an int32 or int64 1D literal to an int64 vector. 
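The ResolveInputDynamismIntoPredVector entry point added above is what a kernel calls to learn, element by element, whether an input is known at compile time. A hypothetical kernel-side sketch follows (the function name is illustrative, and `out` is taken to be a std::vector<bool>, matching the implementation above):

#include <vector>

#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/platform/logging.h"

namespace tensorflow {

// Illustrative only: inspect which elements of input 0 are compile-time
// constants before deciding how to lower the op.
void InspectInputDynamism(XlaOpKernelContext* ctx) {
  std::vector<bool> dynamism;
  OP_REQUIRES_OK(ctx, ctx->ResolveInputDynamismIntoPredVector(0, &dynamism));
  int num_static = 0;
  for (bool is_dynamic : dynamism) {
    // A false entry means the element is known at compile time and could be
    // read through the ConstantInput*() helpers rather than lowered as an op.
    if (!is_dynamic) ++num_static;
  }
  VLOG(2) << num_static << " of " << dynamism.size()
          << " elements of input 0 are static.";
}

}  // namespace tensorflow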
static Status LiteralToInt64Vector(const xla::LiteralSlice& literal, std::vector* out) { @@ -385,7 +408,8 @@ Status XlaOpKernelContext::InputList(absl::string_view name, handles->clear(); shapes->clear(); for (const Tensor& input : inputs) { - handles->push_back(CastExpressionFromTensor(input)->AsXlaOp(builder())); + handles->push_back( + XlaExpression::CastExpressionFromTensor(input)->AsXlaOp(builder())); shapes->push_back(input.shape()); } return Status::OK(); @@ -408,7 +432,7 @@ Status ReadVariableInputTensor(const Tensor& tensor, DataType type, const XlaOpKernelContext* ctx, TensorShape* shape, xla::XlaOp* value) { const XlaExpression* expression = - XlaOpKernelContext::CastExpressionFromTensor(tensor); + XlaExpression::CastExpressionFromTensor(tensor); XlaResource* variable = expression->resource(); TF_RET_CHECK(variable != nullptr); TF_RET_CHECK(variable->kind() == XlaResource::kVariable); @@ -461,7 +485,8 @@ Status XlaOpKernelContext::ReadVariableInput(absl::string_view name, Status XlaOpKernelContext::GetVariableTypeAndShape(int index, DataType* type, TensorShape* shape) const { const Tensor& tensor = context_->input(index); - const XlaExpression* expression = CastExpressionFromTensor(tensor); + const XlaExpression* expression = + XlaExpression::CastExpressionFromTensor(tensor); XlaResource* variable = expression->resource(); TF_RET_CHECK(variable != nullptr); TF_RET_CHECK(variable->kind() == XlaResource::kVariable); @@ -502,8 +527,8 @@ void XlaOpKernelContext::SetOutputExpression(int index, TF_ASSIGN_OR_RETURN(TensorShape shape, expression.GetShape()); TF_RETURN_IF_ERROR(context_->allocate_output(index, shape, &output)); } - XlaOpKernelContext::AssignExpressionToTensor( - expression, context_->mutable_output(index)); + XlaExpression::AssignExpressionToTensor(expression, + context_->mutable_output(index)); return Status::OK(); }(); if (!status.ok()) { @@ -542,7 +567,7 @@ void XlaOpKernelContext::SetResourceOutput(int index, XlaResource* resource) { Status XlaOpKernelContext::GetResourceInput(int index, XlaResource** resource) { const XlaExpression* expression = - CastExpressionFromTensor(context_->input(index)); + XlaExpression::CastExpressionFromTensor(context_->input(index)); TF_RET_CHECK(expression->resource() != nullptr); *resource = expression->resource(); return Status::OK(); @@ -554,7 +579,7 @@ Status AssignVariableTensor(const Tensor& tensor, DataType type, const XlaOpKernelContext* ctx, xla::XlaOp handle, xla::XlaBuilder* builder) { const XlaExpression* expression = - XlaOpKernelContext::CastExpressionFromTensor(tensor); + XlaExpression::CastExpressionFromTensor(tensor); XlaResource* variable = expression->resource(); TF_RET_CHECK(variable != nullptr); TF_RET_CHECK(variable->kind() == XlaResource::kVariable); diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.h b/tensorflow/compiler/tf2xla/xla_op_kernel.h index 6987b6fbb98..75c3e60171a 100644 --- a/tensorflow/compiler/tf2xla/xla_op_kernel.h +++ b/tensorflow/compiler/tf2xla/xla_op_kernel.h @@ -17,6 +17,9 @@ limitations under the License. 
#define TENSORFLOW_COMPILER_TF2XLA_XLA_OP_KERNEL_H_ #include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "tensorflow/compiler/tf2xla/xla_context.h" +#include "tensorflow/compiler/tf2xla/xla_expression.h" +#include "tensorflow/compiler/tf2xla/xla_resource.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -113,6 +116,9 @@ class XlaOpKernelContext { // returns a one-element list. Status InputList(absl::string_view name, std::vector* handles, std::vector* shapes); + // Evaluates input and returns their dynamism vector in a vector of + // predicates. + Status ResolveInputDynamismIntoPredVector(int index, std::vector* out); // Helper methods for constant inputs. @@ -284,13 +290,6 @@ class XlaOpKernelContext { // separate specialization of the computation for each DataType. const xla::XlaComputation* GetOrCreateMul(const DataType type); - // Assigns an XlaExpression to a tensor on an XLA compilation device. - static void AssignExpressionToTensor(const XlaExpression& value, - Tensor* tensor); - - // Retrieves an XlaExpression that was assigned to the specified tensor. - static const XlaExpression* CastExpressionFromTensor(const Tensor& tensor); - private: // Returns the tensor of input `name`. const Tensor& GetInputTensorByName(absl::string_view name); diff --git a/tensorflow/compiler/tf2xla/xla_resource.cc b/tensorflow/compiler/tf2xla/xla_resource.cc index 32d42cb8a42..bec0b46611d 100644 --- a/tensorflow/compiler/tf2xla/xla_resource.cc +++ b/tensorflow/compiler/tf2xla/xla_resource.cc @@ -21,7 +21,6 @@ limitations under the License. #include "absl/memory/memory.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/sharding_util.h" -#include "tensorflow/compiler/tf2xla/xla_context.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/xla/client/xla_builder.h" diff --git a/tensorflow/compiler/xla/array.h b/tensorflow/compiler/xla/array.h index 67bad0f8af7..a85d551769c 100644 --- a/tensorflow/compiler/xla/array.h +++ b/tensorflow/compiler/xla/array.h @@ -289,13 +289,19 @@ class Array { } // Fills the array with random normal variables with the specified mean. - void FillRandom(const T& stddev, const double mean = 0.0, - const int seed = 12345) { + void FillRandom(const T& stddev, double mean = 0.0, int seed = 12345) { + FillRandomDouble(static_cast(stddev), mean, seed); + } + + void FillRandomDouble(double stddev, double mean = 0.0, int seed = 12345) { std::mt19937 g(seed); - std::normal_distribution distribution(mean, - static_cast(stddev)); + std::normal_distribution distribution(mean, stddev); for (int64 i = 0; i < num_elements(); ++i) { - values_[i] = static_cast(distribution(g)); + if (std::is_same()) { + values_[i] = static_cast(distribution(g) > 0.0); + } else { + values_[i] = static_cast(distribution(g)); + } } } @@ -403,7 +409,8 @@ class Array { // Returns the size of the dimension at the given index. 
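One consumer-visible effect of the FillRandom change above: boolean (PRED) arrays can now be filled, since each sample is thresholded at zero. A short usage sketch (illustrative only):

#include <vector>

#include "tensorflow/compiler/xla/array.h"

// Illustrative only: fill a PRED-shaped array and a float array with random
// values; both paths now go through FillRandomDouble.
void FillRandomArrays() {
  const std::vector<xla::int64> dims = {4, 4};

  xla::Array<bool> mask(dims);
  // Each element becomes (sample > 0.0), so mean 0.0 gives roughly 50% true.
  mask.FillRandomDouble(/*stddev=*/1.0, /*mean=*/0.0, /*seed=*/42);

  xla::Array<float> noise(dims);
  noise.FillRandom(/*stddev=*/0.1f);  // forwards to FillRandomDouble
}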
int64 dim(int64 n) const { - CHECK(n < sizes_.size()); + const int64 sizes_size = sizes_.size(); + CHECK(n < sizes_size); return sizes_[n]; } @@ -427,7 +434,7 @@ class Array { if (sizes_.size() != other.sizes_.size()) { return false; } - for (int64 i = 0; i < sizes_.size(); ++i) { + for (int64 i = 0, end = sizes_.size(); i < end; ++i) { if (sizes_[i] != other.sizes_[i]) { return false; } diff --git a/tensorflow/compiler/xla/client/client.cc b/tensorflow/compiler/xla/client/client.cc index 4f020bcec27..09449aeb8b8 100644 --- a/tensorflow/compiler/xla/client/client.cc +++ b/tensorflow/compiler/xla/client/client.cc @@ -312,7 +312,7 @@ StatusOr> Client::Execute( // device 0. // // TODO(b/118493728): Allow Execute to return one result per computation. - for (int64 i = 0; i < results.size(); i++) { + for (int64 i = 0, end = results.size(); i < end; i++) { TF_ASSIGN_OR_RETURN(const Shape& shape, GetShape(*results[i])); if (!ShapeUtil::IsEmptyTuple(shape)) { VLOG(3) << "Fetching result from device " << i << ": " @@ -350,7 +350,7 @@ StatusOr>> Client::ExecuteParallel( } std::vector> outputs; - for (size_t i = 0; i < response.responses_size(); ++i) { + for (size_t i = 0, end = response.responses_size(); i < end; ++i) { outputs.push_back( absl::make_unique(stub_, response.responses(i).output())); if (i < computations.size() && diff --git a/tensorflow/compiler/xla/client/executable_build_options.cc b/tensorflow/compiler/xla/client/executable_build_options.cc index 404f9eb7519..f39a3e79fe5 100644 --- a/tensorflow/compiler/xla/client/executable_build_options.cc +++ b/tensorflow/compiler/xla/client/executable_build_options.cc @@ -76,6 +76,12 @@ ExecutableBuildOptions& ExecutableBuildOptions::set_use_spmd_partitioning( return *this; } +ExecutableBuildOptions& ExecutableBuildOptions::set_deduplicate_hlo( + bool deduplicate_hlo) { + deduplicate_hlo_ = deduplicate_hlo; + return *this; +} + ExecutableBuildOptions& ExecutableBuildOptions::set_device_assignment( const DeviceAssignment& device_assignment) { device_assignment_ = device_assignment; diff --git a/tensorflow/compiler/xla/client/executable_build_options.h b/tensorflow/compiler/xla/client/executable_build_options.h index 9a7fdd974b1..d034eaa7fd6 100644 --- a/tensorflow/compiler/xla/client/executable_build_options.h +++ b/tensorflow/compiler/xla/client/executable_build_options.h @@ -82,6 +82,9 @@ class ExecutableBuildOptions { bool use_spmd_partitioning() const { return use_spmd_partitioning_; } ExecutableBuildOptions& set_use_spmd_partitioning(bool use_spmd_partitioning); + bool deduplicate_hlo() const { return deduplicate_hlo_; } + ExecutableBuildOptions& set_deduplicate_hlo(bool deduplicate_hlo); + // If set, this specifies a static device assignment for the computation. 
// Otherwise, the computation will be compiled generically and can be run with // any device assignment compatible with the computation's replica and @@ -110,6 +113,7 @@ class ExecutableBuildOptions { int num_replicas_ = 1; int num_partitions_ = 1; bool use_spmd_partitioning_ = false; + bool deduplicate_hlo_ = false; absl::optional device_assignment_; bool alias_passthrough_params_ = false; }; diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD index 06fd8ceeb2b..a3c7c39e3ff 100644 --- a/tensorflow/compiler/xla/client/lib/BUILD +++ b/tensorflow/compiler/xla/client/lib/BUILD @@ -55,9 +55,13 @@ xla_test( cc_library( name = "comparators", srcs = ["comparators.cc"], - hdrs = ["comparators.h"], + hdrs = [ + "comparators.h", + "//tensorflow/compiler/xla:literal_util", + ], deps = [ ":constants", + "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto_cc", diff --git a/tensorflow/compiler/xla/client/lib/comparators.cc b/tensorflow/compiler/xla/client/lib/comparators.cc index 74e89b767cf..cd594a5cf39 100644 --- a/tensorflow/compiler/xla/client/lib/comparators.cc +++ b/tensorflow/compiler/xla/client/lib/comparators.cc @@ -32,85 +32,13 @@ limitations under the License. namespace xla { namespace { -using XlaOpGenerator = XlaOp (*)(XlaOp, XlaOp, absl::Span); - -XlaOp BitcastConvertFloatingPointToIntegral(const XlaOp& value, - int64 bit_width) { - PrimitiveType signed_type; - PrimitiveType unsigned_type; - XlaOp max_value; - switch (bit_width) { - case 16: - max_value = - ConstantR0(value.builder(), - static_cast(std::numeric_limits::max())); - signed_type = S16; - unsigned_type = U16; - break; - case 32: - max_value = - ConstantR0(value.builder(), - static_cast(std::numeric_limits::max())); - signed_type = S32; - unsigned_type = U32; - break; - case 64: - max_value = - ConstantR0(value.builder(), - static_cast(std::numeric_limits::max())); - signed_type = S64; - unsigned_type = U64; - break; - default: - return value.builder()->ReportError( - InvalidArgument("Invalid bit width %lld for Comparator floating " - "point parameter.", - bit_width)); - } - // Switch from a floating point value to a integer value in such a way that - // when using the integer value to compare, we get the same result for normal - // values, and -Nan is treated as the smallest value, and Nan is treated as - // the largest value. - // If f is a float, and - // x = bit_cast(f); - // y = x < 0 ? numeric_limits::max() - x : x; - // then y is ordered as an int32 such that finite values have the obvious - // order, -0 is ordered before 0, and -NaN and NaN appear at the beginning - // and end of the ordering. - // Note that in order to avoid -x to overflow, we calculate - // numeric_limits::max() - x as unsigned, and then convert back to - // signed. - auto signed_value = BitcastConvertType(value, signed_type); - auto unsigned_value = BitcastConvertType(value, unsigned_type); - auto flipped_value = - BitcastConvertType(Sub(max_value, unsigned_value), signed_type); - auto is_negative = Lt(signed_value, Zero(value.builder(), signed_type)); - return Select(is_negative, flipped_value, signed_value); -} - -void ConvertFloatingPoint(const PrimitiveType& operand_type, XlaOp* lhs_param, - XlaOp* rhs_param) { - if (primitive_util::IsFloatingPointType(operand_type)) { - PrimitiveType compare_type = operand_type; - // Special-case handling for BF16. 
We currently do not support direct - // comparisons with BF16, so we convert to F32 and then use the F32 - // comparison logic. - if (compare_type == BF16) { - compare_type = F32; - *lhs_param = ConvertElementType(*lhs_param, F32); - *rhs_param = ConvertElementType(*rhs_param, F32); - } - int64 bit_width = primitive_util::BitWidth(compare_type); - *lhs_param = BitcastConvertFloatingPointToIntegral(*lhs_param, bit_width); - *rhs_param = BitcastConvertFloatingPointToIntegral(*rhs_param, bit_width); - } -} +using XlaCompareOp = XlaOp (*)(XlaOp, XlaOp, absl::Span); XlaComputation CreateScalarComparisonComputation( const string& name, const std::vector& operand_types, - XlaBuilder* builder, XlaOpGenerator generator) { + XlaBuilder* builder, XlaCompareOp generator) { CHECK_NE(operand_types.size(), 0); - std::vector> generators(operand_types.size()); + std::vector> generators(operand_types.size()); generators[0] = generator; return CreateScalarComparisonComputation(name, operand_types, generators, builder); @@ -119,7 +47,7 @@ XlaComputation CreateScalarComparisonComputation( XlaComputation CreateScalarComparisonComputation( const string& name, const std::vector& operand_types, - const std::vector>& generators, + const std::vector>& generators, XlaBuilder* builder) { // Create a default computation where we compare only the first two // parameters of type 'operand_types[0]'. @@ -146,7 +74,6 @@ XlaComputation CreateScalarComparisonComputation( absl::StrCat("p.", parameter_count, ".lhs")); auto rhs_param = Parameter(b.get(), parameter_count * 2 + 1, scalar_shape, absl::StrCat("p.", parameter_count, ".rhs")); - ConvertFloatingPoint(operand_type, &lhs_param, &rhs_param); lhs_params.emplace_back(lhs_param); rhs_params.emplace_back(rhs_param); if (generators[parameter_count].has_value()) { @@ -169,7 +96,8 @@ XlaComputation CreateScalarComparisonComputation( generators[i].value()(lhs_params[i], rhs_params[i], {}), result); if (i != last_generator_index) { - param_equal = And(param_equal, Eq(lhs_params[i], rhs_params[i])); + param_equal = + And(param_equal, EqTotalOrder(lhs_params[i], rhs_params[i])); } } } @@ -181,14 +109,14 @@ XlaComputation CreateScalarComparisonComputation( XlaComputation CreateScalarLtComputation( const std::vector& operand_types, XlaBuilder* builder) { return CreateScalarComparisonComputation("compare-less-than", operand_types, - builder, Lt); + builder, LtTotalOrder); } // Creates a scalar greater-than computation and returns it. XlaComputation CreateScalarGtComputation( const std::vector& operand_types, XlaBuilder* builder) { - return CreateScalarComparisonComputation("compare-greater-than", - operand_types, builder, Gt); + return CreateScalarComparisonComputation( + "compare-greater-than", operand_types, builder, GtTotalOrder); } } // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/comparators.h b/tensorflow/compiler/xla/client/lib/comparators.h index 25924d4a4f4..a82a84799aa 100644 --- a/tensorflow/compiler/xla/client/lib/comparators.h +++ b/tensorflow/compiler/xla/client/lib/comparators.h @@ -43,14 +43,13 @@ XlaComputation CreateScalarGtComputation( const std::vector& operand_types, XlaBuilder* builder); // Creates a scalar comparison computation and returns it. This function takes -// an std::vector> and compare the operands -// where the generator isn't nullopt with the specified comparator -// at that location. +// a vector of comparator functions to compare the operands where the function +// isn't nullopt with the specified comparator at that location. 
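These comparator factories are typically consumed by Sort. A small sketch (not part of this change; `builder` and `input` are assumed to already exist) of an ascending F32 sort, which after this change inherits total-order semantics for -0/0 and NaNs from the *TotalOrder comparison ops:

#include "tensorflow/compiler/xla/client/lib/comparators.h"
#include "tensorflow/compiler/xla/client/xla_builder.h"

// Illustrative only: sort `input` along its last dimension in ascending order.
xla::XlaOp SortAscending(xla::XlaBuilder* builder, xla::XlaOp input) {
  xla::XlaComputation less =
      xla::CreateScalarLtComputation({xla::F32}, builder);
  return xla::Sort({input}, less, /*dimension=*/-1, /*is_stable=*/false);
}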
XlaComputation CreateScalarComparisonComputation( const string& name, const std::vector& operand_types, const std::vector< absl::optional)>>& - generators, + comparators, XlaBuilder* builder); } // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/constants.cc b/tensorflow/compiler/xla/client/lib/constants.cc index 6bd56a8df0a..4836dff7fa0 100644 --- a/tensorflow/compiler/xla/client/lib/constants.cc +++ b/tensorflow/compiler/xla/client/lib/constants.cc @@ -48,7 +48,9 @@ XlaOp Epsilon(XlaBuilder* builder, PrimitiveType type) { builder, static_cast(Eigen::NumTraits::epsilon())); case BF16: - return ConstantR0(builder, bfloat16::epsilon()); + return ConstantR0( + builder, static_cast( + Eigen::NumTraits::epsilon())); case F32: return ConstantR0(builder, std::numeric_limits::epsilon()); case F64: @@ -70,7 +72,8 @@ XlaOp MinFiniteValue(XlaBuilder* builder, PrimitiveType type) { return ConstantR0(builder, Eigen::NumTraits::lowest()); case BF16: - return ConstantR0(builder, bfloat16::lowest()); + return ConstantR0( + builder, Eigen::NumTraits::lowest()); case F32: return ConstantR0(builder, -std::numeric_limits::max()); case F64: @@ -86,7 +89,8 @@ XlaOp MinPositiveNormalValue(XlaBuilder* builder, PrimitiveType type) { return ConstantR0(builder, std::numeric_limits::min()); case BF16: - return ConstantR0(builder, bfloat16::min_positive_normal()); + return ConstantR0( + builder, std::numeric_limits::min()); case F32: return ConstantR0(builder, std::numeric_limits::min()); case F64: @@ -108,7 +112,8 @@ XlaOp MaxFiniteValue(XlaBuilder* builder, PrimitiveType type) { return ConstantR0(builder, Eigen::NumTraits::highest()); case BF16: - return ConstantR0(builder, bfloat16::highest()); + return ConstantR0( + builder, Eigen::NumTraits::highest()); case F32: return ConstantR0(builder, std::numeric_limits::max()); case F64: @@ -125,8 +130,8 @@ XlaOp NanValue(XlaBuilder* builder, PrimitiveType type) { return ConstantR0( builder, Eigen::NumTraits::quiet_NaN()); case BF16: - return ConstantR0( - builder, bfloat16(std::numeric_limits::quiet_NaN())); + return ConstantR0( + builder, Eigen::NumTraits::quiet_NaN()); case F32: return ConstantR0(builder, std::numeric_limits::quiet_NaN()); diff --git a/tensorflow/compiler/xla/client/lib/math.cc b/tensorflow/compiler/xla/client/lib/math.cc index baafd7d705b..6fdaab58686 100644 --- a/tensorflow/compiler/xla/client/lib/math.cc +++ b/tensorflow/compiler/xla/client/lib/math.cc @@ -511,7 +511,7 @@ XlaOp Lgamma(XlaOp input) { XlaOp z = Select(need_to_reflect, -input, input - one); XlaOp x = base_lanczos_coeff; - for (int i = 0; i < kLanczosCoefficients.size(); ++i) { + for (int i = 0, end = kLanczosCoefficients.size(); i < end; ++i) { XlaOp lanczos_coefficient = ScalarLike(input, kLanczosCoefficients[i]); XlaOp index = ScalarLike(input, i); x = x + lanczos_coefficient / (z + index + one); @@ -647,7 +647,7 @@ XlaOp Digamma(XlaOp input) { XlaOp num = zero; XlaOp denom = base_lanczos_coeff; - for (int i = 0; i < kLanczosCoefficients.size(); ++i) { + for (int i = 0, end = kLanczosCoefficients.size(); i < end; ++i) { XlaOp lanczos_coefficient = ScalarLike(input, kLanczosCoefficients[i]); XlaOp index = ScalarLike(input, i); num = num - lanczos_coefficient / ((z + index + one) * (z + index + one)); diff --git a/tensorflow/compiler/xla/client/lib/pooling.cc b/tensorflow/compiler/xla/client/lib/pooling.cc index 45033ec07e7..fb04b147ff2 100644 --- a/tensorflow/compiler/xla/client/lib/pooling.cc +++ b/tensorflow/compiler/xla/client/lib/pooling.cc @@ -198,15 +198,17 @@ 
XlaOp AvgPoolGrad(XlaOp out_backprop, absl::Span gradients_size, XlaBuilder* b = out_backprop.builder(); return b->ReportErrorOrReturn([&]() -> StatusOr { const int num_dims = kernel_size.size(); - - if (gradients_size.size() != num_dims) { + const int num_gradients = gradients_size.size(); + if (num_gradients != num_dims) { return tensorflow::errors::InvalidArgument("gradients must be ", num_dims, "-dimensional"); } TF_ASSIGN_OR_RETURN(Shape out_backprop_xla_shape, b->GetShape(out_backprop)); - if (out_backprop_xla_shape.dimensions().size() != num_dims) { + const int backprop_xla_num_dims = + out_backprop_xla_shape.dimensions().size(); + if (backprop_xla_num_dims != num_dims) { return tensorflow::errors::InvalidArgument("out_backprop must be ", num_dims, "-dimensional"); } diff --git a/tensorflow/compiler/xla/client/lib/slicing.cc b/tensorflow/compiler/xla/client/lib/slicing.cc index 1ea713467f8..ebb35c5df82 100644 --- a/tensorflow/compiler/xla/client/lib/slicing.cc +++ b/tensorflow/compiler/xla/client/lib/slicing.cc @@ -74,12 +74,13 @@ XlaOp UpdateSlice(XlaOp x, XlaOp update, absl::Span start) { return builder->ReportErrorOrReturn([&]() -> StatusOr { TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x)); const int64 n_dims = shape.rank(); - TF_RET_CHECK(start.size() == n_dims); + const int64 start_size = start.size(); + TF_RET_CHECK(start_size == n_dims); // TODO(phawkins): make int64 work on all backends, remove the int32 cast. std::vector start_as_int32(start.begin(), start.end()); std::vector start_ops(start.size()); - for (int i = 0; i < start.size(); ++i) { + for (int i = 0, end = start.size(); i < end; ++i) { start_ops[i] = ConstantR0(builder, start_as_int32[i]); } return DynamicUpdateSlice(x, update, start_ops); diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc index 5fc9909fa2a..1389f548c5d 100644 --- a/tensorflow/compiler/xla/client/local_client.cc +++ b/tensorflow/compiler/xla/client/local_client.cc @@ -122,12 +122,13 @@ LocalExecutable::RunHelper(const absl::Span argument_shapes, executable_->module_config().entry_computation_layout(); // Check argument number, shapes, and layouts. 
- if (argument_shapes.size() != computation_layout.parameter_count()) { + const int argument_shapes_size = argument_shapes.size(); + if (argument_shapes_size != computation_layout.parameter_count()) { return InvalidArgument( "invalid number of arguments for computation: expected %d, got %u", computation_layout.parameter_count(), argument_shapes.size()); } - for (int i = 0; i < argument_shapes.size(); ++i) { + for (int i = 0, end = argument_shapes.size(); i < end; ++i) { if (!computation_layout.parameter_layout(i).MatchesLayoutInShape( *argument_shapes[i])) { return InvalidParameterArgument( @@ -187,7 +188,7 @@ StatusOr LocalExecutable::Run( std::vector argument_shapes; argument_shapes.reserve(arguments.size()); for (const ExecutionInput& arg : arguments) { - argument_shapes.push_back(&arg.shape()); + argument_shapes.push_back(&arg.host_shape()); } return AsyncCallAndBlockHostUntilDone( argument_shapes, run_options, [&](const ExecutableRunOptions& options) { @@ -325,7 +326,7 @@ StatusOr LocalExecutable::RunAsync( std::vector argument_shapes; argument_shapes.reserve(arguments.size()); for (const ExecutionInput& arg : arguments) { - argument_shapes.push_back(&arg.shape()); + argument_shapes.push_back(&arg.host_shape()); } return RunAsync(argument_shapes, std::move(arguments), run_options); } diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h index 8b91f4a1739..bb072a0fe2c 100644 --- a/tensorflow/compiler/xla/client/local_client.h +++ b/tensorflow/compiler/xla/client/local_client.h @@ -64,10 +64,6 @@ class LocalExecutable { // Similar to RunAsync(), but allows for donating argument buffers to the // executable. - StatusOr RunAsync( - absl::Span argument_host_shapes, - std::vector arguments, ExecutableRunOptions run_options); - StatusOr RunAsync(std::vector arguments, ExecutableRunOptions run_options); @@ -78,6 +74,10 @@ class LocalExecutable { Executable* executable() const { return executable_.get(); } private: + StatusOr RunAsync( + absl::Span argument_host_shapes, + std::vector arguments, ExecutableRunOptions run_options); + // Validates that the given arguments and options satisfy various constraints // of the computation. // diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc index cc6a680c4e9..2b69c71042d 100644 --- a/tensorflow/compiler/xla/client/xla_builder.cc +++ b/tensorflow/compiler/xla/client/xla_builder.cc @@ -32,6 +32,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/comparison_util.h" #include "tensorflow/compiler/xla/execution_options_util.h" +#include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_input_output_alias_config.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" @@ -39,6 +40,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/platform/errors.h" namespace xla { @@ -71,8 +73,75 @@ void SetProtoIdAndName(T* entry, const string& base_name, char separator, entry->set_id(id); entry->set_name(GetFullName(base_name, separator, id)); } + +ShapeProto ConvertShapeProtoToPred(const ShapeProto& shape_proto) { + return ShapeUtil::ChangeElementType(Shape(shape_proto), PRED).ToProto(); +} + +HloInstructionProto CreateConstantInstruction(int64 id, const Shape& shape, + bool pred) { + HloInstructionProto const_instr; + Literal literal = LiteralUtil::CreateR0(pred); + Literal literal_broadcast = literal.Broadcast(shape, {}).ValueOrDie(); + *const_instr.mutable_shape() = shape.ToProto(); + *const_instr.mutable_literal() = literal_broadcast.ToProto(); + *const_instr.mutable_opcode() = HloOpcodeString(HloOpcode::kConstant); + const_instr.set_id(id); + return const_instr; +} + +// Converts a HloComputation into ReducerOr with predicate types. +HloComputationProto CreateReduceOr(int64 reducer_id, + HloComputationProto* original_reducer) { + HloComputationProto reducer; + SetProtoIdAndName(&reducer, StrCat("reduce_or"), kNameSeparator, reducer_id); + std::vector operands_id; + for (auto& inst : original_reducer->instructions()) { + // Copy params. + if (StringToHloOpcode(inst.opcode()).ValueOrDie() == + HloOpcode::kParameter) { + HloInstructionProto* new_param = reducer.add_instructions(); + *new_param = inst; + *new_param->mutable_shape() = ConvertShapeProtoToPred(inst.shape()); + operands_id.push_back(inst.id()); + } + if (inst.id() == original_reducer->root_id()) { + HloInstructionProto* new_root = reducer.add_instructions(); + *new_root = inst; + *new_root->mutable_shape() = ConvertShapeProtoToPred(inst.shape()); + *new_root->mutable_opcode() = HloOpcodeString(HloOpcode::kOr); + new_root->clear_operand_ids(); + for (int64 operand_id : operands_id) { + new_root->add_operand_ids(operand_id); + } + reducer.set_root_id(inst.id()); + } + } + return reducer; +} } // namespace +namespace internal { + +XlaOp XlaBuilderBuildFusion(XlaBuilder* builder, + absl::Span operands, + absl::string_view fusion_kind, + const XlaComputation& fused_computation) { + return builder->ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + instr.set_fusion_kind(std::string(fusion_kind)); + std::vector operand_shape_ptrs; + TF_ASSIGN_OR_RETURN(auto program_shape, + fused_computation.GetProgramShape()); + *instr.mutable_shape() = program_shape.result().ToProto(); + builder->AddCalledComputation(fused_computation, &instr); + return builder->AddInstruction(std::move(instr), HloOpcode::kFusion, + operands); + }); +} + +} // namespace internal + XlaOp operator-(XlaOp x) { return Neg(x); } XlaOp operator+(XlaOp x, XlaOp y) { return Add(x, y); } XlaOp operator-(XlaOp x, XlaOp y) { return Sub(x, y); } @@ -425,7 +494,7 @@ StatusOr XlaBuilder::Build(int64 root_id, alias.param_index.ToString().c_str()); } TF_RETURN_IF_ERROR(config.SetUpAlias(alias.output_index, alias.param_number, - alias.param_index)); + alias.param_index, alias.kind)); } *module->mutable_input_output_alias() = config.ToProto(); return Status::OK(); @@ -508,7 +577,8 @@ XlaOp XlaBuilder::UnaryOp(HloOpcode unop, XlaOp operand) { XlaOp XlaBuilder::BinaryOp(HloOpcode binop, XlaOp lhs, XlaOp rhs, absl::Span broadcast_dimensions, - absl::optional direction) { + absl::optional direction, + absl::optional type) { return 
ReportErrorOrReturn([&]() -> StatusOr { TF_ASSIGN_OR_RETURN(const Shape* lhs_shape, GetShapePtr(lhs)); TF_ASSIGN_OR_RETURN(const Shape* rhs_shape, GetShapePtr(rhs)); @@ -566,7 +636,11 @@ XlaOp XlaBuilder::BinaryOp(HloOpcode binop, XlaOp lhs, XlaOp rhs, return InvalidArgument( "kCompare expects a ComparisonDirection, but none provided."); } - return Compare(shape, updated_lhs, updated_rhs, *direction); + if (type == absl::nullopt) { + return Compare(shape, updated_lhs, updated_rhs, *direction); + } else { + return Compare(shape, updated_lhs, updated_rhs, *direction, *type); + } } if (direction.has_value()) { @@ -589,8 +663,16 @@ XlaOp XlaBuilder::BinaryOpNoBroadcast(HloOpcode binop, const Shape& shape, StatusOr XlaBuilder::Compare(const Shape& shape, XlaOp lhs, XlaOp rhs, ComparisonDirection direction) { + return Compare(shape, lhs, rhs, direction, + Comparison::DefaultComparisonType(shape.element_type())); +} + +StatusOr XlaBuilder::Compare(const Shape& shape, XlaOp lhs, XlaOp rhs, + ComparisonDirection direction, + Comparison::Type type) { HloInstructionProto instr; instr.set_comparison_direction(ComparisonDirectionToString(direction)); + instr.set_comparison_type(ComparisonTypeToString(type)); *instr.mutable_shape() = shape.ToProto(); return AddInstruction(std::move(instr), HloOpcode::kCompare, {lhs, rhs}); } @@ -766,15 +848,16 @@ XlaOp XlaBuilder::BroadcastInDim( TF_ASSIGN_OR_RETURN(auto output_shape, ShapeUtil::MakeValidatedShape( operand_shape->element_type(), out_dim_size)); - if (operand_shape->rank() != broadcast_dimensions.size()) { + tensorflow::int64 broadcast_rank = broadcast_dimensions.size(); + if (operand_shape->rank() != broadcast_rank) { return InvalidArgument( "Size of broadcast_dimensions has to match operand's rank; operand " "rank: %lld, size of broadcast_dimensions %u.", operand_shape->rank(), broadcast_dimensions.size()); } - for (int i = 0; i < broadcast_dimensions.size(); i++) { - if (broadcast_dimensions[i] < 0 || - broadcast_dimensions[i] > out_dim_size.size()) { + for (int i = 0; i < broadcast_rank; i++) { + const tensorflow::int64 num_dims = out_dim_size.size(); + if (broadcast_dimensions[i] < 0 || broadcast_dimensions[i] > num_dims) { return InvalidArgument("Broadcast dimension %lld is out of bound", broadcast_dimensions[i]); } @@ -786,7 +869,7 @@ XlaOp XlaBuilder::BroadcastInDim( *operand_shape, output_shape, broadcast_dimensions) .status()); std::vector in_dim_size(out_dim_size.begin(), out_dim_size.end()); - for (int i = 0; i < broadcast_dimensions.size(); i++) { + for (int i = 0; i < broadcast_rank; i++) { in_dim_size[broadcast_dimensions[i]] = operand_shape->dimensions(i); } const auto& in_dim_shape = @@ -835,7 +918,7 @@ StatusOr XlaBuilder::SliceInternal(const Shape& shape, XlaOp operand, absl::Span strides) { HloInstructionProto instr; *instr.mutable_shape() = shape.ToProto(); - for (int i = 0; i < start_indices.size(); i++) { + for (int i = 0, end = start_indices.size(); i < end; i++) { auto* slice_config = instr.add_slice_dimensions(); slice_config->set_start(start_indices[i]); slice_config->set_limit(limit_indices[i]); @@ -1543,7 +1626,7 @@ XlaOp XlaBuilder::AfterAll(absl::Span tokens) { if (tokens.empty()) { return InvalidArgument("AfterAll requires at least one operand"); } - for (int i = 0; i < tokens.size(); ++i) { + for (int i = 0, end = tokens.size(); i < end; ++i) { XlaOp operand = tokens[i]; TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); if (!operand_shape->IsToken()) { @@ -1706,8 +1789,6 @@ XlaOp 
XlaBuilder::Sort(absl::Span operands, const XlaComputation& comparator, int64 dimension, bool is_stable) { return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - instr.set_is_stable(is_stable); std::vector operand_shape_ptrs; TF_ASSIGN_OR_RETURN(std::vector operand_shapes, GetOperandShapes(operands)); @@ -1715,17 +1796,26 @@ XlaOp XlaBuilder::Sort(absl::Span operands, [](const Shape& shape) { return &shape; }); TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferVariadicOpShape( HloOpcode::kSort, operand_shape_ptrs)); - *instr.mutable_shape() = shape.ToProto(); - if (dimension == -1) { - TF_ASSIGN_OR_RETURN(const Shape* keys_shape, GetShapePtr(operands[0])); - dimension = keys_shape->rank() - 1; - } - instr.add_dimensions(dimension); - AddCalledComputation(comparator, &instr); - return AddInstruction(std::move(instr), HloOpcode::kSort, operands); + return SortInternal(shape, operands, comparator, dimension, is_stable); }); } +StatusOr XlaBuilder::SortInternal(const Shape& shape, + absl::Span operands, + const XlaComputation& comparator, + int64 dimension, bool is_stable) { + HloInstructionProto instr; + *instr.mutable_shape() = shape.ToProto(); + instr.set_is_stable(is_stable); + if (dimension == -1) { + TF_ASSIGN_OR_RETURN(const Shape* keys_shape, GetShapePtr(operands[0])); + dimension = keys_shape->rank() - 1; + } + instr.add_dimensions(dimension); + AddCalledComputation(comparator, &instr); + return AddInstruction(std::move(instr), HloOpcode::kSort, operands); +} + XlaOp XlaBuilder::ConvertElementType(XlaOp operand, PrimitiveType new_element_type) { return ReportErrorOrReturn([&]() -> StatusOr { @@ -1739,16 +1829,21 @@ XlaOp XlaBuilder::ConvertElementType(XlaOp operand, XlaOp XlaBuilder::BitcastConvertType(XlaOp operand, PrimitiveType new_element_type) { return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferConvertShape( *operand_shape, new_element_type)); - *instr.mutable_shape() = shape.ToProto(); - return AddInstruction(std::move(instr), HloOpcode::kBitcastConvert, - {operand}); + return BitcastConvertTypeInternal(shape, operand); }); } +StatusOr XlaBuilder::BitcastConvertTypeInternal(const Shape& shape, + XlaOp operand) { + HloInstructionProto instr; + *instr.mutable_shape() = shape.ToProto(); + return AddInstruction(std::move(instr), HloOpcode::kBitcastConvert, + {operand}); +} + XlaOp XlaBuilder::Clamp(XlaOp min, XlaOp operand, XlaOp max) { return TernaryOp(HloOpcode::kClamp, min, operand, max); } @@ -1870,8 +1965,6 @@ XlaOp XlaBuilder::RngBitGenerator(RandomAlgorithm algorithm, XlaOp XlaBuilder::While(const XlaComputation& condition, const XlaComputation& body, XlaOp init) { return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - // Infer shape. TF_ASSIGN_OR_RETURN(const auto& body_program_shape, body.GetProgramShape()); TF_ASSIGN_OR_RETURN(const auto& condition_program_shape, @@ -1880,14 +1973,22 @@ XlaOp XlaBuilder::While(const XlaComputation& condition, TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferWhileShape( condition_program_shape, body_program_shape, *init_shape)); - *instr.mutable_shape() = shape.ToProto(); - // Body comes before condition computation in the vector. 
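The Sort and While builder methods are being split into overridable *Internal hooks without changing the public client API; the hooks only give subclasses a place to intercept instruction creation. For reference, a minimal client-side sketch (illustrative, not part of this change) that builds `while (i < 10) i = i + 1` over an S32 scalar:

#include <cstdint>

#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/shape_util.h"

// Illustrative only: condition and body are built on their own sub-builders,
// then stitched together with the free-standing While() wrapper.
xla::XlaOp CountToTen(xla::XlaBuilder* builder) {
  const xla::Shape s32 = xla::ShapeUtil::MakeShape(xla::S32, {});

  xla::XlaBuilder cond_b("count_to_ten_cond");
  xla::Lt(xla::Parameter(&cond_b, 0, s32, "i"),
          xla::ConstantR0<int32_t>(&cond_b, 10));
  xla::XlaComputation cond = cond_b.Build().ValueOrDie();

  xla::XlaBuilder body_b("count_to_ten_body");
  xla::Add(xla::Parameter(&body_b, 0, s32, "i"),
           xla::ConstantR0<int32_t>(&body_b, 1));
  xla::XlaComputation body = body_b.Build().ValueOrDie();

  return xla::While(cond, body, xla::ConstantR0<int32_t>(builder, 0));
}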
- AddCalledComputation(body, &instr); - AddCalledComputation(condition, &instr); - return AddInstruction(std::move(instr), HloOpcode::kWhile, {init}); + return WhileInternal(shape, condition, body, init); }); } +StatusOr XlaBuilder::WhileInternal(const Shape& shape, + const XlaComputation& condition, + const XlaComputation& body, + XlaOp init) { + HloInstructionProto instr; + *instr.mutable_shape() = shape.ToProto(); + // Body comes before condition computation in the vector. + AddCalledComputation(body, &instr); + AddCalledComputation(condition, &instr); + return AddInstruction(std::move(instr), HloOpcode::kWhile, {init}); +} + XlaOp XlaBuilder::Gather(XlaOp input, XlaOp start_indices, const GatherDimensionNumbers& dimension_numbers, absl::Span slice_sizes, @@ -2007,7 +2108,7 @@ XlaOp XlaBuilder::ConditionalImpl( std::vector branch_operand_shapes(branch_operands.size()); std::vector branch_computation_shapes( branch_computations.size()); - for (int j = 0; j < branch_operands.size(); ++j) { + for (int j = 0, end = branch_operands.size(); j < end; ++j) { TF_ASSIGN_OR_RETURN(branch_operand_shapes[j], GetShape(branch_operands[j])); TF_ASSIGN_OR_RETURN(branch_computation_shapes[j], @@ -2416,7 +2517,9 @@ XlaOp XlaBuilder::AllToAll(XlaOp operand, int64 split_dimension, if (layout) { TF_RET_CHECK(shape.IsTuple() && !ShapeUtil::IsNestedTuple(shape)); for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { - if (layout->minor_to_major().size() != shape.tuple_shapes(i).rank()) { + const int64 layout_minor_to_major_size = + layout->minor_to_major().size(); + if (layout_minor_to_major_size != shape.tuple_shapes(i).rank()) { return InvalidArgument( "Provided layout must be compatible with the operand shape: %s " "vs %s", @@ -2800,6 +2903,196 @@ StatusOr XlaBuilder::IsConstant(XlaOp operand) const { return is_constant; } +StatusOr XlaBuilder::BuildDynamicInferenceGraph(XlaOp root_op) { + TF_ASSIGN_OR_RETURN(const HloInstructionProto* root, + LookUpInstruction(root_op)); + + HloComputationProto entry; + SetProtoIdAndName(&entry, StrCat(name_, "_dynamic_inference"), kNameSeparator, + GetNextId()); + ProgramShapeProto* program_shape = entry.mutable_program_shape(); + *program_shape->mutable_result() = + ShapeUtil::ChangeElementType(Shape(root->shape()), PRED).ToProto(); + + std::set seen; + struct WorkItem { + explicit WorkItem(int64 handle, bool need_rewrite) + : handle(handle), need_rewrite(need_rewrite) {} + int64 handle; + // If need_rewrite is true, the instruction will be copied and rewrite into + // a pred instruction indicating if each value is dynamic. If need_rewrite + // is false, simply copy the instruction to the output graph. + // E.g., + // For select(P, A, B), we need to rewrite A and B into predicates, but + // don't need to rewrite P. + bool need_rewrite; + }; + std::queue worklist; + worklist.push(WorkItem(root->id(), true)); + entry.set_root_id(root->id()); + std::vector called_computatons; + // Rewritre instruction with id "from" into the new graph. + // Returns more work items that need to finish. + auto rewrite_instruction = + [&](int64 from, bool need_rewrite) -> StatusOr> { + // Rewrite the instruction with following rules: + // - Unary ops: Convert into bitcast (identity) with type Pred. + // - Binary ops: Convert into binary or. + // - Select: Convert into binary or with its two data operands. + // - Concat / Tuple/ GTE / Bitcast: Copy. + // - Param: Convert to constant True. 
+ // - GetDimensionSize: Convert to constant True if dimension is dynamic, + // contant False if dimension is static. + // - Reduce: Convert to reduce or. + // - Constant: Convert to constant False. + // - Other ops: Not supported. + // Create the instruction for the new handle. + TF_ASSIGN_OR_RETURN(const HloInstructionProto* instr_proto, + LookUpInstructionByHandle(from)); + + TF_ASSIGN_OR_RETURN(HloOpcode opcode, + StringToHloOpcode(instr_proto->opcode())); + std::vector operands_todo; + auto* new_instr = entry.add_instructions(); + *new_instr = *instr_proto; + for (auto operand_id : new_instr->operand_ids()) { + operands_todo.emplace_back(operand_id, need_rewrite); + } + + if (!need_rewrite) { + *new_instr->mutable_name() = + GetFullName(instr_proto->opcode(), kNameSeparator, instr_proto->id()); + return operands_todo; + } + *new_instr->mutable_shape() = ConvertShapeProtoToPred(instr_proto->shape()); + Shape new_shape(new_instr->shape()); + switch (opcode) { + case HloOpcode::kAbs: + case HloOpcode::kRoundNearestAfz: + case HloOpcode::kBitcast: + case HloOpcode::kCeil: + case HloOpcode::kCollectivePermuteDone: + case HloOpcode::kCos: + case HloOpcode::kClz: + case HloOpcode::kExp: + case HloOpcode::kExpm1: + case HloOpcode::kFloor: + case HloOpcode::kImag: + case HloOpcode::kIsFinite: + case HloOpcode::kLog: + case HloOpcode::kLog1p: + case HloOpcode::kNot: + case HloOpcode::kNegate: + case HloOpcode::kPopulationCount: + case HloOpcode::kReal: + case HloOpcode::kRsqrt: + case HloOpcode::kLogistic: + case HloOpcode::kSign: + case HloOpcode::kSin: + case HloOpcode::kConvert: + case HloOpcode::kSqrt: + case HloOpcode::kCbrt: + case HloOpcode::kTanh: + CHECK_EQ(instr_proto->operand_ids_size(), 1); + *new_instr->mutable_opcode() = HloOpcodeString(HloOpcode::kBitcast); + break; + case HloOpcode::kAdd: + case HloOpcode::kAtan2: + case HloOpcode::kDivide: + case HloOpcode::kComplex: + case HloOpcode::kMaximum: + case HloOpcode::kMinimum: + case HloOpcode::kMultiply: + case HloOpcode::kPower: + case HloOpcode::kRemainder: + case HloOpcode::kSubtract: + case HloOpcode::kCompare: + case HloOpcode::kAnd: + case HloOpcode::kOr: + case HloOpcode::kXor: + case HloOpcode::kShiftLeft: + case HloOpcode::kShiftRightArithmetic: + case HloOpcode::kShiftRightLogical: + CHECK_EQ(instr_proto->operand_ids_size(), 2); + *new_instr->mutable_opcode() = HloOpcodeString(HloOpcode::kOr); + break; + case HloOpcode::kSelect: + operands_todo[0].need_rewrite = false; + break; + case HloOpcode::kGather: + operands_todo[1].need_rewrite = false; + break; + case HloOpcode::kReduce: { + int64 reducer_id = new_instr->called_computation_ids(0); + called_computatons.push_back( + CreateReduceOr(reducer_id, &embedded_[reducer_id])); + break; + } + case HloOpcode::kTuple: + case HloOpcode::kTranspose: + case HloOpcode::kGetTupleElement: + case HloOpcode::kSlice: + case HloOpcode::kBroadcast: + case HloOpcode::kConcatenate: + case HloOpcode::kReshape: + break; + case HloOpcode::kGetDimensionSize: { + int64 dimension = instr_proto->dimensions(0); + int64 operand_handle = instr_proto->operand_ids(0); + TF_ASSIGN_OR_RETURN(const HloInstructionProto* operand_proto, + LookUpInstructionByHandle(operand_handle)); + + *new_instr = CreateConstantInstruction( + from, new_shape, + operand_proto->shape().is_dynamic_dimension(dimension)); + operands_todo.clear(); + break; + } + case HloOpcode::kConstant: + *new_instr = CreateConstantInstruction(from, new_shape, false); + break; + case HloOpcode::kParameter: + *new_instr = 
CreateConstantInstruction(from, new_shape, true); + break; + default: + return InvalidArgument("Dynamic inferencing %s is not supported", + instr_proto->DebugString()); + } + *new_instr->mutable_name() = + GetFullName(instr_proto->opcode(), kNameSeparator, instr_proto->id()); + return operands_todo; + }; + + while (!worklist.empty()) { + WorkItem item = worklist.front(); + worklist.pop(); + if (!seen.insert(item.handle).second) { + continue; + } + TF_ASSIGN_OR_RETURN(auto todos, + rewrite_instruction(item.handle, item.need_rewrite)); + for (WorkItem& todo : todos) { + worklist.push(todo); + } + } + absl::c_sort(*entry.mutable_instructions(), + [](const HloInstructionProto& p1, + const HloInstructionProto& p2) { return p1.id() < p2.id(); }); + XlaComputation computation(entry.id()); + HloModuleProto* module = computation.mutable_proto(); + module->set_name(entry.name()); + module->set_id(entry.id()); + module->set_entry_computation_name(entry.name()); + module->set_entry_computation_id(entry.id()); + *module->mutable_host_program_shape() = *program_shape; + for (auto& called_comp : called_computatons) { + *module->add_computations() = called_comp; + } + *module->add_computations() = std::move(entry); + XLA_VLOG_LINES(3, module->DebugString()); + return std::move(computation); +} + StatusOr XlaBuilder::BuildConstantSubGraph( XlaOp root_op, bool dynamic_dimension_is_minus_one) { TF_ASSIGN_OR_RETURN(bool is_constant, IsConstant(root_op)); @@ -3021,7 +3314,12 @@ StatusOr XlaBuilder::AddInstruction(HloInstructionProto&& instr, instr.add_operand_ids(operand.handle()); } - *instr.mutable_metadata() = metadata_; + if (one_shot_metadata_.has_value()) { + *instr.mutable_metadata() = one_shot_metadata_.value(); + one_shot_metadata_.reset(); + } else { + *instr.mutable_metadata() = metadata_; + } if (sharding_) { *instr.mutable_sharding() = *sharding_; } @@ -3227,31 +3525,71 @@ XlaOp Eq(const XlaOp lhs, const XlaOp rhs, return Compare(lhs, rhs, broadcast_dimensions, ComparisonDirection::kEq); } +XlaOp EqTotalOrder(const XlaOp lhs, const XlaOp rhs, + absl::Span broadcast_dimensions) { + auto compare_type = Comparison::Type::kFloatTotalOrder; + return Compare(lhs, rhs, broadcast_dimensions, ComparisonDirection::kEq, + compare_type); +} + XlaOp Ne(const XlaOp lhs, const XlaOp rhs, absl::Span broadcast_dimensions) { return Compare(lhs, rhs, broadcast_dimensions, ComparisonDirection::kNe); } +XlaOp NeTotalOrder(const XlaOp lhs, const XlaOp rhs, + absl::Span broadcast_dimensions) { + auto compare_type = Comparison::Type::kFloatTotalOrder; + return Compare(lhs, rhs, broadcast_dimensions, ComparisonDirection::kNe, + compare_type); +} + XlaOp Ge(const XlaOp lhs, const XlaOp rhs, absl::Span broadcast_dimensions) { return Compare(lhs, rhs, broadcast_dimensions, ComparisonDirection::kGe); } +XlaOp GeTotalOrder(const XlaOp lhs, const XlaOp rhs, + absl::Span broadcast_dimensions) { + auto compare_type = Comparison::Type::kFloatTotalOrder; + return Compare(lhs, rhs, broadcast_dimensions, ComparisonDirection::kGe, + compare_type); +} + XlaOp Gt(const XlaOp lhs, const XlaOp rhs, absl::Span broadcast_dimensions) { return Compare(lhs, rhs, broadcast_dimensions, ComparisonDirection::kGt); } +XlaOp GtTotalOrder(const XlaOp lhs, const XlaOp rhs, + absl::Span broadcast_dimensions) { + auto compare_type = Comparison::Type::kFloatTotalOrder; + return Compare(lhs, rhs, broadcast_dimensions, ComparisonDirection::kGt, + compare_type); +} + XlaOp Le(const XlaOp lhs, const XlaOp rhs, absl::Span broadcast_dimensions) { return 
Compare(lhs, rhs, broadcast_dimensions, ComparisonDirection::kLe); } +XlaOp LeTotalOrder(const XlaOp lhs, const XlaOp rhs, + absl::Span broadcast_dimensions) { + auto compare_type = Comparison::Type::kFloatTotalOrder; + return Compare(lhs, rhs, broadcast_dimensions, ComparisonDirection::kLe, + compare_type); +} XlaOp Lt(const XlaOp lhs, const XlaOp rhs, absl::Span broadcast_dimensions) { return Compare(lhs, rhs, broadcast_dimensions, ComparisonDirection::kLt); } +XlaOp LtTotalOrder(const XlaOp lhs, const XlaOp rhs, + absl::Span broadcast_dimensions) { + return Compare(lhs, rhs, broadcast_dimensions, ComparisonDirection::kLt, + Comparison::Type::kFloatTotalOrder); +} + XlaOp Compare(const XlaOp lhs, const XlaOp rhs, absl::Span broadcast_dimensions, ComparisonDirection direction) { @@ -3259,6 +3597,13 @@ XlaOp Compare(const XlaOp lhs, const XlaOp rhs, broadcast_dimensions, direction); } +XlaOp Compare(const XlaOp lhs, const XlaOp rhs, + absl::Span broadcast_dimensions, + ComparisonDirection direction, Comparison::Type compare_type) { + return lhs.builder()->BinaryOp(HloOpcode::kCompare, lhs, rhs, + broadcast_dimensions, direction, compare_type); +} + XlaOp Compare(const XlaOp lhs, const XlaOp rhs, ComparisonDirection direction) { return Compare(lhs, rhs, {}, direction); } diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h index 60bdc32e68d..6d30195d3d0 100644 --- a/tensorflow/compiler/xla/client/xla_builder.h +++ b/tensorflow/compiler/xla/client/xla_builder.h @@ -32,6 +32,7 @@ limitations under the License. #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/dynamic_parameter_binding.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" +#include "tensorflow/compiler/xla/service/hlo_input_output_alias_config.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -45,6 +46,16 @@ limitations under the License. namespace xla { class XlaBuilder; +class XlaOp; + +namespace internal { + +XlaOp XlaBuilderBuildFusion(XlaBuilder* builder, + absl::Span operands, + absl::string_view fusion_kind, + const XlaComputation& fused_computation); + +} // namespace internal // This represents an instruction that has been enqueued using the XlaBuilder. // This is used to pass to subsequent computations that depends upon the @@ -153,6 +164,11 @@ class XlaBuilder { // OpMetadata attached until a call to ClearOpMetadata. void SetOpMetadata(OpMetadata metadata) { metadata_ = std::move(metadata); } + // Similar to SetOpMetadata, but only set the metadata for the next op. + void SetOneShotOpMetadata(OpMetadata metadata) { + metadata_ = std::move(metadata); + } + // Clears the HloMetadata state. void ClearOpMetadata() { metadata_.Clear(); } @@ -262,6 +278,31 @@ class XlaBuilder { StatusOr BuildConstantSubGraph( XlaOp root_op, bool dynamic_dimension_is_uint_max = false); + // Similar to BuildConstantSubGraph, but with root element type changed to + // boolean. A true value in the root indicates that the value is dynamic while + // false value indicates that the value is a constant. This will copy the + // needed ops/computations to the subgraph. 
+ // + // E.g., + // Compuptation { + // a = 3 + // b = param(0) + // ROOT Tuple(a + b, a + 1, b + 1) + // } + // Calling BuildDynamicInferenceGraph on root will produce the following + // graph: + // + // Compuptation { + // a = False + // b = True + // ROOT Tuple(a | b, a, b) + // } + // + // The result, which is (True, False, True) after evaluation, can be + // interpreted as "First element is dynamic; Second element is static; Third + // element is dynamic". + StatusOr BuildDynamicInferenceGraph(XlaOp root_op); + // Returns the first error that was encountered while building the // computation. When an error is encountered, by default we return a vacuous // XlaOp and inform the user of the error that occurred while @@ -334,12 +375,16 @@ class XlaBuilder { // not available until the computation is built, and eventual error in the // arguments of this API will be detected only at computation Build() time. // - // Note: Aliasing API is 'may-alias' and only donated buffer at runtime will - // be aliased with output. If a buffer is not donated at runtime, a copy will - // be inserted by XLA to prevent buffer clobbering. + // Note: Except when 'must-alias' is true, alias is assumed to be 'may-alias' + // and only donated buffer at runtime will be aliased with output. If a buffer + // is not donated at runtime, a copy will be inserted by XLA to prevent buffer + // clobbering. void SetUpAlias(const ShapeIndex& output_index, int64 param_number, - const ShapeIndex& param_index) { - input_output_aliases_.push_back({output_index, param_number, param_index}); + const ShapeIndex& param_index, + HloInputOutputAliasConfig::AliasKind kind = + HloInputOutputAliasConfig::AliasKind::kMayAlias) { + input_output_aliases_.push_back( + {output_index, param_number, param_index, kind}); } // Describes an input/output alias as inserted by the SetUpAlias() API. @@ -350,6 +395,8 @@ class XlaBuilder { int64 param_number; // Specifies the index of the aliased buffer in the parameter ShapeIndex param_index; + // Specifies if the alias is a must alias or may alias. + HloInputOutputAliasConfig::AliasKind kind; }; // Looks up the HloInstruction and sets the frontend attribute "attribute" to @@ -624,6 +671,8 @@ class XlaBuilder { XlaOp ConvertElementType(XlaOp operand, PrimitiveType new_element_type); XlaOp BitcastConvertType(XlaOp operand, PrimitiveType new_element_type); + virtual StatusOr BitcastConvertTypeInternal(const Shape& shape, + XlaOp operand); XlaOp Transpose(XlaOp operand, absl::Span permutation); virtual StatusOr TransposeInternal( @@ -635,6 +684,10 @@ class XlaBuilder { XlaOp Sort(absl::Span operands, const XlaComputation& comparator, int64 dimension = -1, bool is_stable = false); + virtual StatusOr SortInternal(const Shape& shape, + absl::Span operands, + const XlaComputation& comparator, + int64 dimension, bool is_stable); XlaOp Clamp(XlaOp min, XlaOp operand, XlaOp max); @@ -651,6 +704,9 @@ class XlaBuilder { XlaOp While(const XlaComputation& condition, const XlaComputation& body, XlaOp init); + virtual StatusOr WhileInternal(const Shape& shape, + const XlaComputation& condition, + const XlaComputation& body, XlaOp init); XlaOp Conditional(XlaOp predicate, XlaOp true_operand, const XlaComputation& true_computation, XlaOp false_operand, @@ -736,14 +792,17 @@ class XlaBuilder { // broadcast_dimensions specifies which dimensions to use for broadcasting // when the operation is between tensors of different ranks. The direction is // only used if opcode is kCompare. 
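  // The optional `type`, when provided, overrides the comparison type that
  // would otherwise be derived from the operand element type (see
  // Comparison::DefaultComparisonType); like `direction`, it is only used when
  // opcode is kCompare. An illustrative client-side call that reaches this
  // helper with an explicit type (operand names are placeholders):
  //
  //   XlaOp lt = Compare(lhs, rhs, /*broadcast_dimensions=*/{},
  //                      ComparisonDirection::kLt,
  //                      Comparison::Type::kFloatTotalOrder);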
- XlaOp BinaryOp( - HloOpcode binop, XlaOp lhs, XlaOp rhs, - absl::Span broadcast_dimensions, - absl::optional direction = absl::nullopt); + XlaOp BinaryOp(HloOpcode binop, XlaOp lhs, XlaOp rhs, + absl::Span broadcast_dimensions, + absl::optional direction = absl::nullopt, + absl::optional type = absl::nullopt); // Internal helper method for binary op compare without broadcast dimensions. virtual StatusOr Compare(const Shape& shape, XlaOp lhs, XlaOp rhs, - Comparison::Direction direction); + ComparisonDirection direction); + virtual StatusOr Compare(const Shape& shape, XlaOp lhs, XlaOp rhs, + ComparisonDirection direction, + Comparison::Type type); // Internal helper method that does the building for an arbitrary binary op // with same ranked operands that doesn't broadcast. @@ -842,6 +901,9 @@ class XlaBuilder { // throughout the TensorFlow op kernel implementations). OpMetadata metadata_; + // A temporary metadata that will only be applied to the next op created. + absl::optional one_shot_metadata_; + // Sharding for this operator. This is structured as a "model"-like operation, // in order to simplify client code, similar to metadata_. absl::optional sharding_; @@ -906,22 +968,13 @@ class XlaBuilder { friend XlaOp Select(XlaOp pred, XlaOp on_true, XlaOp on_false); friend XlaOp Tuple(XlaBuilder* builder, absl::Span elements); friend XlaOp GetTupleElement(XlaOp tuple_data, int64 index); - friend XlaOp Eq(XlaOp lhs, XlaOp rhs, - absl::Span broadcast_dimensions); - friend XlaOp Ne(XlaOp lhs, XlaOp rhs, - absl::Span broadcast_dimensions); - friend XlaOp Ge(XlaOp lhs, XlaOp rhs, - absl::Span broadcast_dimensions); - friend XlaOp Gt(XlaOp lhs, XlaOp rhs, - absl::Span broadcast_dimensions); - friend XlaOp Lt(XlaOp lhs, XlaOp rhs, - absl::Span broadcast_dimensions); - friend XlaOp Le(XlaOp lhs, XlaOp rhs, - absl::Span broadcast_dimensions); friend XlaOp Compare(XlaOp lhs, XlaOp rhs, absl::Span broadcast_dimensions, ComparisonDirection direction); - friend XlaOp Compare(XlaOp lhs, XlaOp rhs, ComparisonDirection direction); + friend XlaOp Compare(XlaOp lhs, XlaOp rhs, + absl::Span broadcast_dimensions, + ComparisonDirection direction, + Comparison::Type compare_type); friend XlaOp Dot(XlaOp lhs, XlaOp rhs, const PrecisionConfig* precision_config); friend XlaOp DotGeneral(XlaOp lhs, XlaOp rhs, @@ -1205,6 +1258,10 @@ class XlaBuilder { TF_RETURN_IF_ERROR(CheckOpBuilder(op)); return LookUpInstructionByHandleInternal(op.handle()); } + + friend XlaOp internal::XlaBuilderBuildFusion( + XlaBuilder* builder, absl::Span operands, + absl::string_view fusion_kind, const XlaComputation& fused_computation); }; // RAII-style object: sets the current sharding assignment in builder on @@ -1511,29 +1568,44 @@ XlaOp GetTupleElement(XlaOp tuple_data, int64 index); // Enqueues an equal-to comparison instruction onto the computation. XlaOp Eq(XlaOp lhs, XlaOp rhs, absl::Span broadcast_dimensions = {}); +XlaOp EqTotalOrder(XlaOp lhs, XlaOp rhs, + absl::Span broadcast_dimensions = {}); // Enqueues a not-equal comparison instruction onto the computation. XlaOp Ne(XlaOp lhs, XlaOp rhs, absl::Span broadcast_dimensions = {}); +XlaOp NeTotalOrder(XlaOp lhs, XlaOp rhs, + absl::Span broadcast_dimensions = {}); // Enqueues a greater-or-equal comparison instruction onto the computation. XlaOp Ge(XlaOp lhs, XlaOp rhs, absl::Span broadcast_dimensions = {}); +XlaOp GeTotalOrder(XlaOp lhs, XlaOp rhs, + absl::Span broadcast_dimensions = {}); // Enqueues a greater-than comparison instruction onto the computation. 
XlaOp Gt(XlaOp lhs, XlaOp rhs, absl::Span broadcast_dimensions = {}); +XlaOp GtTotalOrder(XlaOp lhs, XlaOp rhs, + absl::Span broadcast_dimensions = {}); // Enqueues a less-than comparison instruction onto the computation. XlaOp Lt(XlaOp lhs, XlaOp rhs, absl::Span broadcast_dimensions = {}); +XlaOp LtTotalOrder(XlaOp lhs, XlaOp rhs, + absl::Span broadcast_dimensions = {}); // Enqueues a less-or-equal comparison instruction onto the computation. XlaOp Le(XlaOp lhs, XlaOp rhs, absl::Span broadcast_dimensions = {}); +XlaOp LeTotalOrder(XlaOp lhs, XlaOp rhs, + absl::Span broadcast_dimensions = {}); // Enqueues a comparison instruction onto the computation (optionally without // broadcast_dimensions for consistency with others). +XlaOp Compare(XlaOp lhs, XlaOp rhs, + absl::Span broadcast_dimensions, + ComparisonDirection direction, Comparison::Type compare_type); XlaOp Compare(XlaOp lhs, XlaOp rhs, absl::Span broadcast_dimensions, ComparisonDirection direction); diff --git a/tensorflow/compiler/xla/comparison_util.cc b/tensorflow/compiler/xla/comparison_util.cc index 47fb69e3bce..06dd9642cac 100644 --- a/tensorflow/compiler/xla/comparison_util.cc +++ b/tensorflow/compiler/xla/comparison_util.cc @@ -54,32 +54,59 @@ StatusOr StringToComparisonDirection( return it->second; } -Comparison::Comparison(Direction dir, PrimitiveType type) : dir_(dir) { +StatusOr StringToComparisonType( + absl::string_view compare_type_name) { + static auto* type_map = new absl::flat_hash_map({ + {"FLOAT", Comparison::Type::kFloat}, + {"TOTALORDER", Comparison::Type::kFloatTotalOrder}, + {"SIGNED", Comparison::Type::kSigned}, + {"UNSIGNED", Comparison::Type::kUnsigned}, + }); + auto it = type_map->find(compare_type_name); + if (it == type_map->end()) { + return InvalidArgument("Unknown comparison type: %s", compare_type_name); + } + return it->second; +} + +std::string ComparisonTypeToString(Comparison::Type type) { + switch (type) { + case Comparison::Type::kFloat: + return "FLOAT"; + case Comparison::Type::kFloatTotalOrder: + return "TOTALORDER"; + case Comparison::Type::kSigned: + return "SIGNED"; + case Comparison::Type::kUnsigned: + return "UNSIGNED"; + } +} + +Comparison::Comparison(Direction dir, PrimitiveType type) + : dir_(dir), type_(DefaultComparisonType(type)) {} + +Comparison::Type Comparison::DefaultComparisonType(PrimitiveType type) { switch (type) { case S8: case S16: case S32: case S64: - type_ = Type::kSigned; - break; + return Type::kSigned; case PRED: case U8: case U16: case U32: case U64: - type_ = Type::kUnsigned; - break; + return Type::kUnsigned; case F16: case F32: case BF16: case F64: case C64: case C128: - type_ = Type::kFloat; - break; + return Type::kFloat; default: LOG(FATAL) << "Unsupported comparison mode." 
- << ComparisonDirectionToString(dir) << ":" << PrimitiveType_Name(type) << "\n"; } } @@ -164,20 +191,6 @@ bool Comparison::IsAntireflexive() const { } } -/* static */ const char* Comparison::ComparisonTypeToString( - Comparison::Type type) { - switch (type) { - case Type::kFloat: - return "f"; - case Type::kFloatTotalOrder: - return "ft"; - case Type::kSigned: - return "s"; - case Type::kUnsigned: - return "u"; - } -} - std::string Comparison::ToString(std::string prefix1, std::string prefix2) const { return prefix1 + std::string(ComparisonDirectionToString(dir_)) + prefix2 + diff --git a/tensorflow/compiler/xla/comparison_util.h b/tensorflow/compiler/xla/comparison_util.h index 11335c6b5ba..33ae2c67106 100644 --- a/tensorflow/compiler/xla/comparison_util.h +++ b/tensorflow/compiler/xla/comparison_util.h @@ -103,11 +103,11 @@ class Comparison { bool Compare(const T a, const T b) const { return GetComparator()(a, b); } + static Type DefaultComparisonType(PrimitiveType t); private: static Direction Converse(Direction dir); static Direction Inverse(Direction dir); - static const char* ComparisonTypeToString(Type type); const Direction dir_; Type type_; @@ -117,10 +117,14 @@ inline std::ostream& operator<<(std::ostream& os, const Comparison& cmp) { return os << cmp.ToString(); } string ComparisonDirectionToString(Comparison::Direction direction); +std::string ComparisonTypeToString(Comparison::Type type); StatusOr StringToComparisonDirection( absl::string_view direction_name); +StatusOr StringToComparisonType( + absl::string_view compare_type_name); + using ComparisonDirection = Comparison::Direction; } // namespace xla diff --git a/tensorflow/compiler/xla/debug_options_flags.cc b/tensorflow/compiler/xla/debug_options_flags.cc index 8ca6e2b294c..2dd7acb2f67 100644 --- a/tensorflow/compiler/xla/debug_options_flags.cc +++ b/tensorflow/compiler/xla/debug_options_flags.cc @@ -71,7 +71,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() { opts.set_xla_allow_excess_precision(true); opts.set_xla_force_host_platform_device_count(1); opts.set_xla_gpu_deterministic_reductions(false); - opts.set_xla_cpu_enable_xprof_traceme(true); + opts.set_xla_cpu_enable_xprof_traceme(false); opts.set_xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found(false); return opts; @@ -535,10 +535,10 @@ static void AllocateFlags() { flag_values->xla_gpu_force_conv_nchw(), "For cuDNN convolutions, always NCHW layouts.")); flag_objects->push_back(tensorflow::Flag( - "xla_gpu_algorithm_blacklist_path", - string_setter_for(&DebugOptions::set_xla_gpu_algorithm_blacklist_path), - flag_values->xla_gpu_algorithm_blacklist_path(), - "An AlgorithmBlacklist text proto file as a blacklist of convolutions to " + "xla_gpu_algorithm_denylist_path", + string_setter_for(&DebugOptions::set_xla_gpu_algorithm_denylist_path), + flag_values->xla_gpu_algorithm_denylist_path(), + "An AlgorithmDenylist text proto file as a denylist of convolutions to " "avoid to use.")); flag_objects->push_back(tensorflow::Flag( "xla_gpu_deterministic_reductions", diff --git a/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py b/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py index 212ad87d94c..16563bab5bc 100644 --- a/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py +++ b/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py @@ -294,3 +294,39 @@ def manual_to_auto_spmd_partition(tensor, manual_sharding, full_shape): """ return tf2xla.spmd_shard_to_full_shape( tensor, manual_sharding=manual_sharding, 
full_shape=full_shape) + + +def mesh_split(tensor, + device_mesh, + tensor_split_dims_mapping, + use_sharding_op=False): + """Returns a tensor that is split along multiple dimensions in a device mesh. + + Args: + tensor: A tf.Tensor to split. + device_mesh: An np.ndarray describing the topology of the device mesh and + each element is the ID of the device in the topology. + tensor_split_dims_mapping: A list of integers that map each tensor axis to + the device mesh axis along which it is sharded. Its length is the tensor + rank, and tensor_split_dims_mapping[i] is device mesh axis for tensor + dimension i. Use -1 for tensor dimensions that are not sharded. + use_sharding_op: If true, adds a sharding op to set the sharding. + + Raises: + ValueError: The number of tensor split dimensions is different from device + mesh rank. + """ + permutation = [d for d in tensor_split_dims_mapping if d >= 0] + if len(permutation) != len(device_mesh.shape): + raise ValueError( + 'Number of tensor split dimensions (%r) is different from device mesh ' + 'rank (%r). tensor_split_dims_mapping: %r, device_mesh.shape: %r' % + (len(permutation), len( + device_mesh.shape), tensor_split_dims_mapping, device_mesh.shape)) + tile_assignment = _np.transpose(device_mesh, permutation) + tile_shape = [ + 1 if d < 0 else device_mesh.shape[d] for d in tensor_split_dims_mapping + ] + tile_assignment = _np.reshape(tile_assignment, tile_shape) + + return tile(tensor, tile_assignment, use_sharding_op=use_sharding_op) diff --git a/tensorflow/compiler/xla/g3doc/_book.yaml b/tensorflow/compiler/xla/g3doc/_book.yaml index e05f69b1e8b..8d217b89ae3 100644 --- a/tensorflow/compiler/xla/g3doc/_book.yaml +++ b/tensorflow/compiler/xla/g3doc/_book.yaml @@ -17,6 +17,8 @@ upper_tabs: path: /xla - title: XLA architecture path: /xla/architecture + - title: Known issues + path: /xla/known_issues - title: Broadcasting semantics path: /xla/broadcasting - title: Develop a new backend for XLA diff --git a/tensorflow/compiler/xla/g3doc/index.md b/tensorflow/compiler/xla/g3doc/index.md index 60bde306266..51d666fba9a 100644 --- a/tensorflow/compiler/xla/g3doc/index.md +++ b/tensorflow/compiler/xla/g3doc/index.md @@ -177,30 +177,6 @@ a bug to a single XLA program by using the [`replay_computation`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/xla/tools/run_hlo_module_main.cc) and iteratively running it on generated programs. -## Known Issues - -Compilation with XLA can greatly improve the performance of your programs, but -the TensorFlow interop has a number of known sharp corners. - -### TensorArray TF/XLA Interconversion - -The problem manifests itself as an error message -`Support for TensorList crossing the XLA/TF boundary is not implemented`. - -XLA supports `tf.TensorArray`. However, the _interconversion_ between TF and -XLA representations is not implemented yet. -This error often arises when the `TensorArray` is used inside the compiled -block, but the derivative is taken outside. - -Workaround: compile the outermost scope which is taking the derivative. - -### Random Number Generation - -XLA currently ignores TF seeds to random operations. This affects stateful TF -random operations, such as `tf.random.normal`, or `tf.nn.dropout`. XLA will -behave as if the compilation was seeded with a new unique seed at each run. This -limitation does not apply to stateless random ops. 
- ## XLA Frontends Apart from TensorFlow, XLA programs can be generated by: diff --git a/tensorflow/compiler/xla/g3doc/known_issues.md b/tensorflow/compiler/xla/g3doc/known_issues.md new file mode 100644 index 00000000000..1c03c716a02 --- /dev/null +++ b/tensorflow/compiler/xla/g3doc/known_issues.md @@ -0,0 +1,32 @@ +# Known Issues + +Compilation with XLA can greatly improve the performance of your programs, but +the TensorFlow interop has a number of known sharp corners. + +## TensorArray TF/XLA interconversion + +The problem manifests itself as an error message +`Support for TensorList crossing the XLA/TF boundary is not implemented`. + +XLA supports `tf.TensorArray`. However, the _interconversion_ between TF and +XLA representations is not implemented yet. +This error often arises when the `TensorArray` is used inside the compiled +block, but the derivative is taken outside. + +Workaround: compile the outermost scope which is taking the derivative. + +## Dynamic `tf.TensorArray` is not supported + +Writes into `tf.TensorArray(..., dynamic_size=True)` are not compilable with +XLA, as such writes require an unknown number of reallocations when the array +exceeds the original bound. + +Workaround: provide a statically known bound to your arrays. + +## Random number generation + +XLA currently ignores TF seeds to random operations. This affects stateful TF +random operations, such as `tf.random.normal`, or `tf.nn.dropout`. XLA will +behave as if the compilation was seeded with a new unique seed at each run. This +limitation does not apply to stateless random ops. + diff --git a/tensorflow/compiler/xla/g3doc/operation_semantics.md b/tensorflow/compiler/xla/g3doc/operation_semantics.md index 3031bfbf2e2..051c1539f6b 100644 --- a/tensorflow/compiler/xla/g3doc/operation_semantics.md +++ b/tensorflow/compiler/xla/g3doc/operation_semantics.md @@ -1235,7 +1235,10 @@ floating-point types. Where `Op` is one of `Eq` (equal-to), `Ne` (not equal-to), `Ge` (greater-or-equal-than), `Gt` (greater-than), `Le` (less-or-equal-than), `Lt` -(less-than). +(less-than). Another set of operators, EqTotalOrder, NeTotalOrder, GeTotalOrder, +GtTotalOrder, LeTotalOrder, and LtTotalOrder, provide the same functionalities, +except that they additionally support a total order over the floating point +numbers, by enforcing -NaN < -Inf < -Finite < -0 < +0 < +Finite < +Inf < +NaN. 
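A minimal builder-level sketch, assuming the C++ client API above and using illustrative operand values, that contrasts the default comparison with its total-order variant on a NaN input:

```c++
#include <limits>

#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/statusor.h"

// Illustrative sketch: emits both a default Lt and an LtTotalOrder compare
// over the same operands so the two semantics can be inspected side by side.
xla::StatusOr<xla::XlaComputation> BuildTotalOrderExample() {
  xla::XlaBuilder b("total_order_example");
  const float nan = std::numeric_limits<float>::quiet_NaN();
  auto x = xla::ConstantR1<float>(&b, {1.0f, nan});
  auto y = xla::ConstantR1<float>(&b, {2.0f, 2.0f});
  // Default partial order: Lt is false whenever either operand is NaN.
  auto lt = xla::Lt(x, y);
  // Total order: +NaN sorts above +Inf, so NaN < 2.0f is still false, while
  // GeTotalOrder(x, y) would report true for the NaN element.
  auto lt_total = xla::LtTotalOrder(x, y);
  xla::Tuple(&b, {lt, lt_total});
  return b.Build();
}
```
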
Arguments | Type | Semantics --------- | ------- | ---------------------------------------- diff --git a/tensorflow/compiler/xla/index_util.cc b/tensorflow/compiler/xla/index_util.cc index 463a8d95fc5..4bec454e520 100644 --- a/tensorflow/compiler/xla/index_util.cc +++ b/tensorflow/compiler/xla/index_util.cc @@ -143,7 +143,8 @@ namespace xla { /* static */ bool IndexUtil::IndexInBounds(const Shape& shape, absl::Span index) { int64 rank = shape.rank(); - if (rank != index.size()) { + const int64 index_size = index.size(); + if (rank != index_size) { return false; } for (int64 d = 0; d < rank; ++d) { @@ -157,7 +158,8 @@ namespace xla { /* static */ int IndexUtil::CompareIndices(absl::Span lhs, absl::Span rhs) { int64 rank = lhs.size(); - CHECK_EQ(rhs.size(), rank); + const int64 rhs_rank = rhs.size(); + CHECK_EQ(rhs_rank, rank); for (int64 dim = 0; dim < rank; ++dim) { if (lhs[dim] < rhs[dim]) { return -1; diff --git a/tensorflow/compiler/xla/layout_util.cc b/tensorflow/compiler/xla/layout_util.cc index faa33e292c2..afd7141477f 100644 --- a/tensorflow/compiler/xla/layout_util.cc +++ b/tensorflow/compiler/xla/layout_util.cc @@ -342,7 +342,8 @@ Layout CreateDefaultLayoutForRank(int64 rank) { /* static */ std::vector LayoutUtil::MakeLogicalToPhysical( const Layout& layout) { std::vector logical_to_physical(layout.minor_to_major_size()); - for (int64 physical = 0; physical < logical_to_physical.size(); ++physical) { + for (int64 physical = 0, end = logical_to_physical.size(); physical < end; + ++physical) { const int64 logical = Major(layout, physical); logical_to_physical[logical] = physical; } diff --git a/tensorflow/compiler/xla/literal.cc b/tensorflow/compiler/xla/literal.cc index 73c37d6b2f3..d26e0881c53 100644 --- a/tensorflow/compiler/xla/literal.cc +++ b/tensorflow/compiler/xla/literal.cc @@ -48,13 +48,17 @@ namespace { using absl::StrCat; constexpr bool kLittleEndian = __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__; +// Literals can be used as DMA targets, which can require alignment. We +// force a tensorflow::Allocator::kAllocatorAlignment-byte minimum +// alignment. +constexpr int kMinimumAlignment = 64; // Converts between little and big endian. // // Precondition: size % 2 == 0 (elements in the array are 16 bits long) void ConvertEndianShort(string* bytes) { CHECK_EQ(bytes->size() / 2, 0); - for (int64 i = 0; i < bytes->size(); i += 2) { + for (int64 i = 0, end = bytes->size(); i < end; i += 2) { std::swap((*bytes)[i], (*bytes)[i + 1]); } } @@ -133,12 +137,14 @@ void Literal::SetPiece(const Shape& shape, Piece* piece, bool allocate_arrays) { } } else if (shape.IsArray()) { if (allocate_arrays) { - // Literals can be used as DMA targets, which can require alignment. We - // force a tensorflow::Allocator::kAllocatorAlignment-byte minimum - // alignment. 
- constexpr int kMinimumAlignment = 64; piece->set_buffer(static_cast(tensorflow::port::AlignedMalloc( piece->size_bytes(), kMinimumAlignment))); + if (shape.is_dynamic()) { + CHECK_EQ(piece->dynamic_size_buffer(), nullptr); + piece->set_dynamic_size_buffer( + static_cast(tensorflow::port::AlignedMalloc( + piece->dynamic_size_buffer_bytes(), kMinimumAlignment))); + } } } else { // If the shape is neither an array nor tuple, then it must be @@ -171,6 +177,9 @@ void Literal::DeallocateBuffers() { if (piece->buffer() != nullptr) { tensorflow::port::AlignedFree(piece->buffer()); } + if (piece->dynamic_size_buffer() != nullptr) { + tensorflow::port::AlignedFree(piece->dynamic_size_buffer()); + } }); } @@ -199,6 +208,15 @@ Literal LiteralBase::CreateFromShape(const Shape& shape) { return literal; } +int32 LiteralBase::GetDynamicSize(int64 dim_index) const { + return GetDynamicSize(dim_index, {}); +} + +int32 LiteralBase::GetDynamicSize(int64 dim_index, + const ShapeIndex& shape_index) const { + return piece(shape_index).GetDynamicSize(dim_index); +} + absl::optional LiteralBase::GetFirstInteger() const { switch (shape().element_type()) { case U8: @@ -231,8 +249,10 @@ template Status MutableLiteralBase::CopySliceFromInternal( const LiteralBase& src_literal, absl::Span src_base, absl::Span dest_base, absl::Span copy_size) { - TF_RET_CHECK(src_literal.shape().rank() == src_base.size()); - TF_RET_CHECK(shape().rank() == dest_base.size()); + const int64 src_base_size = src_base.size(); + const int64 dest_base_size = dest_base.size(); + TF_RET_CHECK(src_literal.shape().rank() == src_base_size); + TF_RET_CHECK(shape().rank() == dest_base_size); auto linear_index = [](const Shape& shape, absl::Span multi_index) { @@ -381,7 +401,9 @@ std::vector Literal::DecomposeTuple() { // Move the respective buffer over to the element Literal. dest_piece->set_buffer(src_piece.buffer()); + dest_piece->set_dynamic_size_buffer(src_piece.dynamic_size_buffer()); src_piece.set_buffer(nullptr); + src_piece.set_dynamic_size_buffer(nullptr); }); } // Set this literal to be nil-shaped. @@ -407,23 +429,51 @@ void CopyElementsBetween(absl::Span dest, src[IndexUtil::MultidimensionalIndexToLinearIndex(src_shape, index)]; } while (IndexUtil::BumpIndices(dest_shape, absl::MakeSpan(index))); } - } // namespace -Status LiteralBase::Piece::CopyFrom(const LiteralBase::Piece& src) { +int32 LiteralBase::Piece::GetDynamicSize(int64 dim_index) const { + CHECK(LayoutUtil::IsDenseArray(subshape())); + if (!subshape_->is_dynamic_dimension(dim_index)) { + // This is a static dimension, return size. + return subshape_->dimensions(dim_index); + } + CHECK_NE(dynamic_size_buffer(), nullptr); + return dynamic_size_buffer_[dim_index]; +} + +void LiteralBase::Piece::SetDynamicSize(int64 dim_index, int32 size) { + CHECK(LayoutUtil::IsDenseArray(subshape())); + CHECK(subshape_->is_dynamic_dimension(dim_index)); + if (dynamic_size_buffer() == nullptr) { + // Lazily initialize the dynamic size buffer. + set_dynamic_size_buffer(static_cast(tensorflow::port::AlignedMalloc( + dynamic_size_buffer_bytes(), kMinimumAlignment))); + /*for (int64 i = 0; i < subshape().rank(); ++i) { + // Initialized to -1 to help debug. 
+ dynamic_size_buffer_[i] = -1; + }*/ + } + dynamic_size_buffer_[dim_index] = size; +} + +Status LiteralBase::Piece::CopyFrom(const LiteralBase::Piece& src, + bool only_dynamic_bound) { CHECK(subshape_ != nullptr); CHECK(src.subshape_ != nullptr); if (ShapeUtil::Equal(subshape(), src.subshape())) { // If the layouts are equal it's faster just to memcpy. memcpy(buffer(), src.buffer(), src.size_bytes()); } else { - TF_RET_CHECK(ShapeUtil::Compatible(src.subshape(), subshape())); std::vector origin(subshape().rank(), 0); switch (subshape().element_type()) { -#define COPY_ELEMENTS(XLA_T, NATIVE_T) \ - case (XLA_T): \ - CopyElementsBetween(data(), src.data(), \ - subshape(), src.subshape()); \ +#define COPY_ELEMENTS(XLA_T, NATIVE_T) \ + case (XLA_T): \ + if (only_dynamic_bound) { \ + CopyElementsWithDynamicBound(src); \ + } else { \ + CopyElementsBetween(data(), src.data(), \ + subshape(), src.subshape()); \ + } \ break; COPY_ELEMENTS(U8, uint8); COPY_ELEMENTS(U16, uint16); @@ -447,21 +497,54 @@ Status LiteralBase::Piece::CopyFrom(const LiteralBase::Piece& src) { PrimitiveType_Name(subshape().element_type())); } } + DCHECK_EQ(dynamic_size_buffer_bytes(), src.dynamic_size_buffer_bytes()); + if (subshape().is_dynamic() && src.subshape().is_dynamic()) { + CHECK_NE(dynamic_size_buffer_, nullptr); + CHECK_NE(src.dynamic_size_buffer_, nullptr); + memcpy(dynamic_size_buffer(), src.dynamic_size_buffer(), + src.dynamic_size_buffer_bytes()); + } return Status::OK(); } +void MutableLiteralBase::SetDynamicSize(int64 dim_index, int32 size) { + return SetDynamicSize(dim_index, {}, size); +} + +void MutableLiteralBase::SetDynamicSize(int64 dim_index, + const ShapeIndex& shape_index, + int32 size) { + Shape* subshape_ = ShapeUtil::GetMutableSubshape(shape_.get(), shape_index); + CHECK_GE(subshape_->dimensions(dim_index), size); + if (subshape_->dimensions(dim_index) == size) { + subshape_->set_dynamic_dimension(dim_index, false); + return; + } + subshape_->set_dynamic_dimension(dim_index, true); + piece(shape_index).SetDynamicSize(dim_index, size); +} + Status MutableLiteralBase::CopyFrom(const LiteralSlice& src_literal, const ShapeIndex& dest_shape_index, - const ShapeIndex& src_shape_index) { + const ShapeIndex& src_shape_index, + bool only_dynamic_bound) { const Shape& dest_subshape = ShapeUtil::GetSubshape(shape(), dest_shape_index); const Shape& src_subshape = ShapeUtil::GetSubshape(src_literal.shape(), src_shape_index); - if (!ShapeUtil::Compatible(dest_subshape, src_subshape)) { - return InvalidArgument( - "Destination subshape incompatible with source subshape: %s vs %s", - ShapeUtil::HumanString(dest_subshape), - ShapeUtil::HumanString(src_subshape)); + if (only_dynamic_bound) { + auto bound_shape = dest_subshape.is_static() ? src_subshape : dest_subshape; + auto compact_shape = + dest_subshape.is_static() ? dest_subshape : src_subshape; + CHECK(ShapeUtil::DynamicShapeIsCompatible(compact_shape, bound_shape)) + << compact_shape.ToString() << " vs " << bound_shape.ToString(); + } else { + if (!ShapeUtil::Compatible(dest_subshape, src_subshape)) { + return InvalidArgument( + "Destination subshape incompatible with source subshape: %s vs %s", + ShapeUtil::HumanString(dest_subshape), + ShapeUtil::HumanString(src_subshape)); + } } return root_piece_->ForEachMutableSubpieceWithStatus( [&](const ShapeIndex& index, Piece* piece) { @@ -483,10 +566,13 @@ Status MutableLiteralBase::CopyFrom(const LiteralSlice& src_literal, } // Construct the index of the corresponding piece in the source literal. 
ShapeIndex src_piece_index = src_shape_index; - for (int64 i = dest_shape_index.size(); i < index.size(); ++i) { + for (int64 i = dest_shape_index.size(), end = index.size(); i < end; + ++i) { src_piece_index.push_back(index[i]); } - TF_RETURN_IF_ERROR(piece->CopyFrom(src_literal.piece(src_piece_index))); + TF_RETURN_IF_ERROR( + piece->CopyFrom(src_literal.piece(src_piece_index), + /*only_dynamic_bound=*/only_dynamic_bound)); return Status::OK(); }); } @@ -514,7 +600,9 @@ Status Literal::MoveFrom(Literal&& src_literal, } Piece& dest_piece = piece(dest_index); tensorflow::port::AlignedFree(dest_piece.buffer()); + tensorflow::port::AlignedFree(dest_piece.dynamic_size_buffer()); dest_piece.set_buffer(src_piece.buffer()); + dest_piece.set_dynamic_size_buffer(src_piece.dynamic_size_buffer()); }); src_literal.shape_ = absl::make_unique(ShapeUtil::MakeNil()); @@ -629,13 +717,48 @@ Literal LiteralBase::Relayout(const Shape& shape_with_layout) const { return result; } +Literal LiteralBase::ToBoundedDynamic(const Shape& bounded_shape) const { + CHECK(bounded_shape.is_dynamic()); + Literal result(bounded_shape); + ShapeUtil::ForEachSubshape( + shape(), [&](const Shape& subshape, const ShapeIndex& index) { + if (!subshape.IsArray()) { + return; + } + for (int64 i = 0; i < subshape.rank(); ++i) { + result.SetDynamicSize(i, subshape.dimensions(i)); + } + }); + TF_CHECK_OK(result.CopyFrom(*this, {}, {}, /*only_dynamic_bound=*/true)); + + return result; +} + +Literal LiteralBase::ToStatic() const { + // Create new shape with 'new_layout' set at the given shape index. + Shape new_shape = shape(); + ShapeUtil::ForEachMutableSubshape( + &new_shape, [this](Shape* subshape, const ShapeIndex& index) { + if (!subshape->IsArray()) { + return; + } + for (int64 i = 0; i < subshape->rank(); ++i) { + subshape->set_dynamic_dimension(i, false); + subshape->set_dimensions(i, GetDynamicSize(i, index)); + } + }); + Literal result(new_shape); + TF_CHECK_OK(result.CopyFrom(*this, {}, {}, /*only_dynamic_bound=*/true)); + return result; +} + StatusOr LiteralBase::Broadcast( const Shape& result_shape, absl::Span dimensions) const { if (!shape().IsArray()) { return InvalidArgument("Broadcast only supports arrays."); } - for (int64 i = 0; i < dimensions.size(); i++) { + for (int64 i = 0, end = dimensions.size(); i < end; i++) { TF_RET_CHECK(shape().dimensions(i) == result_shape.dimensions(dimensions[i])); } @@ -652,9 +775,14 @@ StatusOr LiteralBase::Broadcast( const int64 primitive_size = ShapeUtil::ByteSizeOfPrimitiveType(shape().element_type()); + for (int64 i = 0; i < dimensions.size(); ++i) { + int64 dynamic_size = GetDynamicSize(i); + result.SetDynamicSize(dimensions[i], dynamic_size); + } + ShapeUtil::ForEachIndex( result_shape, [&](absl::Span output_index) { - for (int64 i = 0; i < dimensions.size(); ++i) { + for (int64 i = 0, end = dimensions.size(); i < end; ++i) { scratch_source_index[i] = output_index[dimensions[i]]; } int64 dest_index = IndexUtil::MultidimensionalIndexToLinearIndex( @@ -674,6 +802,9 @@ StatusOr LiteralBase::Reshape( if (!shape().IsArray()) { return InvalidArgument("Reshape does not support tuples."); } + if (shape().is_dynamic()) { + return Unimplemented("Dynamic reshape is not implemented."); + } Literal output; if (!LayoutUtil::IsMonotonicWithDim0Major(shape().layout())) { output = Relayout(LayoutUtil::GetDefaultLayoutForRank(shape().rank())); @@ -728,6 +859,9 @@ Literal LiteralBase::Transpose(absl::Span permutation) const { layout->add_minor_to_major(inverse_permutation[index]); } Literal 
new_literal(permuted_shape); + for (int64 i = 0; i < shape().rank(); i++) { + new_literal.SetDynamicSize(inverse_permutation[i], GetDynamicSize(i)); + } DCHECK_EQ(ShapeUtil::ByteSizeOf(new_literal.shape()), ShapeUtil::ByteSizeOf(shape())); std::memcpy(new_literal.untyped_data(), untyped_data(), size_bytes()); @@ -747,6 +881,14 @@ Literal LiteralBase::SliceInternal( return Get(new_indices); }) .ok()); + for (int64 dnum = 0; dnum < shape().rank(); ++dnum) { + if (shape().is_dynamic_dimension(dnum)) { + int64 dynamic_size = GetDynamicSize(dnum) - start_indices[dnum]; + CHECK_GE(dynamic_size, 0) << GetDynamicSize(dnum); + dynamic_size = std::min(dynamic_size, result_shape.dimensions(dnum)); + result_literal.SetDynamicSize(dnum, dynamic_size); + } + } return result_literal; } @@ -763,9 +905,10 @@ Literal LiteralBase::Slice(absl::Span start_indices, CHECK_GE(dimension, 0) << "dnum = " << dnum; result_dimensions.push_back(dimension); } - const auto result_shape = + auto result_shape = ShapeUtil::MakeShapeWithLayout(shape().element_type(), result_dimensions, LayoutUtil::MinorToMajor(shape())); + ShapeUtil::CopyDynamicDimensions(&result_shape, shape()); switch (result_shape.element_type()) { case PRED: return SliceInternal(result_shape, start_indices); @@ -861,14 +1004,20 @@ absl::optional LiteralBase::GetIntegralAsS64( switch (shape().element_type()) { case PRED: return Get(multi_index); + case S8: + return Get(multi_index); case U8: return Get(multi_index); + case S16: + return Get(multi_index); + case U16: + return Get(multi_index); case S32: return Get(multi_index); - case S64: - return Get(multi_index); case U32: return Get(multi_index); + case S64: + return Get(multi_index); case U64: return Get(multi_index); default: @@ -1045,8 +1194,9 @@ void DenseArrayToStringHelper(const LiteralBase& literal, } // Handle the non-innermost tensors of a 2D+ tensor. 
if (brace == "{") { + const int64 accum_indices_size = accum_indices->size(); if (rank > 3 && !accum_indices->empty() && - accum_indices->size() < rank) { + accum_indices_size < rank) { int index = accum_indices->size() - 1; int value = accum_indices->back(); return StrCat(brace, " /*i", index, "=", value, "*/\n"); @@ -1082,11 +1232,24 @@ void DenseArrayToStringHelper(const LiteralBase& literal, if (print_shape) { pieces->push_back(ShapeToString(print_layout, subshape)); + if (subshape.is_dynamic()) { + pieces->push_back("("); + for (int64 i = 0; i < subshape.dimensions_size(); ++i) { + pieces->push_back(StrCat(literal.GetDynamicSize(i, shape_index))); + if (i < subshape.dimensions_size() - 1) { + pieces->push_back(","); + } + } + pieces->push_back(")"); + } pieces->push_back(" "); } std::vector indices = {}; - std::vector dimensions(subshape.dimensions().begin(), - subshape.dimensions().end()); + std::vector dimensions; + dimensions.reserve(subshape.rank()); + for (int64 i = 0; i < subshape.rank(); ++i) { + dimensions.push_back(literal.GetDynamicSize(i, shape_index)); + } to_string_recursive(dimensions, &indices); } @@ -1367,20 +1530,51 @@ StatusOr LiteralBase::ConvertToShape(const Shape& dest_shape) const { } Literal literal(ShapeUtil::MakeTupleShape(element_shapes), /*allocate_arrays=*/false); - for (int i = 0; i < elements.size(); ++i) { + for (int i = 0, end = elements.size(); i < end; ++i) { TF_CHECK_OK( literal.MoveFrom(std::move(elements[i]), /*dest_shape_index=*/{i})); } return literal; } +template +void LiteralBase::Piece::CopyElementsWithDynamicBound( + const LiteralBase::Piece& src) { + auto dest_shape = subshape(); + auto src_shape = src.subshape(); + + // At least one shape has to be static as bound. + CHECK(dest_shape.is_static() || src_shape.is_static()); + auto bound_shape = dest_shape.is_static() ? src_shape : dest_shape; + if (ShapeUtil::IsZeroElementArray(dest_shape)) { + return; + } + std::vector index(dest_shape.rank()); + do { + bool out_of_bound = false; + for (int64 i = 0; i < index.size(); ++i) { + // Do not copy elements beyond dynamic bound. 
+ if (index[i] >= GetDynamicSize(i) || index[i] >= src.GetDynamicSize(i)) { + out_of_bound = true; + } + } + if (out_of_bound) { + continue; + } + data()[IndexUtil::MultidimensionalIndexToLinearIndex(dest_shape, + index)] = + src.data()[IndexUtil::MultidimensionalIndexToLinearIndex( + src_shape, index)]; + } while (IndexUtil::BumpIndices(bound_shape, absl::MakeSpan(index))); +} + template bool LiteralBase::Piece::EqualElementsInternal( const LiteralBase::Piece& other, std::vector* multi_index) const { if (multi_index->size() == subshape().rank()) { return (Get(*multi_index) == other.Get(*multi_index)); } - for (int64 i = 0; i < subshape().dimensions(multi_index->size()); ++i) { + for (int64 i = 0; i < GetDynamicSize(multi_index->size()); ++i) { multi_index->push_back(i); if (!EqualElementsInternal(other, multi_index)) { return false; @@ -1390,10 +1584,24 @@ bool LiteralBase::Piece::EqualElementsInternal( return true; } -bool LiteralBase::Piece::EqualElements(const LiteralBase::Piece& other) const { +bool LiteralBase::Piece::EqualDynamicSize( + const LiteralBase::Piece& other) const { DCHECK(ShapeUtil::Compatible(subshape(), other.subshape())); + if (subshape().is_static()) { + return true; + } - if (ShapeUtil::Equal(subshape(), other.subshape()) && + for (int64 i = 0; i < subshape().rank(); ++i) { + if (GetDynamicSize(i) != other.GetDynamicSize(i)) { + return false; + } + } + return true; +} + +bool LiteralBase::Piece::EqualElements(const LiteralBase::Piece& other) const { + if (subshape().is_static() && + ShapeUtil::Equal(subshape(), other.subshape()) && LayoutUtil::IsDenseArray(subshape())) { CHECK_EQ(size_bytes(), other.size_bytes()); return memcmp(buffer(), other.buffer(), size_bytes()) == 0; @@ -1403,14 +1611,16 @@ bool LiteralBase::Piece::EqualElements(const LiteralBase::Piece& other) const { switch (subshape().element_type()) { case PRED: return EqualElementsInternal(other, &multi_index); - case U8: - return EqualElementsInternal(other, &multi_index); + case S8: + return EqualElementsInternal(other, &multi_index); case S16: return EqualElementsInternal(other, &multi_index); case S32: return EqualElementsInternal(other, &multi_index); case S64: return EqualElementsInternal(other, &multi_index); + case U8: + return EqualElementsInternal(other, &multi_index); case U16: return EqualElementsInternal(other, &multi_index); case U32: @@ -1436,17 +1646,33 @@ bool LiteralBase::Piece::EqualElements(const LiteralBase::Piece& other) const { } bool LiteralBase::operator==(const LiteralBase& other) const { - if (!ShapeUtil::Compatible(shape(), other.shape())) { + // Checking the structure of tuple literals. Checks for dense arrays are + // performed below. 
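  // Note: two array literals with the same bounded shape but different dynamic
  // sizes compare unequal here, even when the elements inside the common
  // dynamic bound are identical.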
+ if (!ShapeUtil::EqualStructure(shape(), other.shape())) { return false; } return root_piece().ForEachSubpieceWithBool( [&](const ShapeIndex& index, const Piece& piece) { + const Piece& other_piece = other.piece(index); + const Shape& subshape = piece.subshape(); + const Shape& other_subshape = other_piece.subshape(); + if (subshape.element_type() != other_subshape.element_type()) { + return false; + } if (!piece.subshape().IsArray()) { return true; } + if (subshape.rank() != other_subshape.rank()) { + return false; + } + + for (int64 i = 0; i < subshape.rank(); ++i) { + if (piece.GetDynamicSize(i) != other_piece.GetDynamicSize(i)) { + return false; + } + } - const Piece& other_piece = other.piece(index); if (!piece.EqualElements(other_piece)) { return false; } @@ -1677,13 +1903,13 @@ bool LiteralBase::IsR1Iota() const { auto is_iota_at_idx = [&](const int64 idx) { switch (shape().element_type()) { case U8: - return Get({idx}) == idx; + return static_cast(Get({idx})) == idx; case U16: - return Get({idx}) == idx; + return static_cast(Get({idx})) == idx; case U32: - return Get({idx}) == idx; + return static_cast(Get({idx})) == idx; case U64: - return Get({idx}) == idx; + return static_cast(Get({idx})) == idx; case S8: return Get({idx}) == idx; case S16: @@ -1960,8 +2186,9 @@ Status LiteralBase::Piece::CopyFromProto(const LiteralProto& proto) { } case C128: { auto complex_data = data(); - TF_RET_CHECK(proto.c128s_size() == complex_data.size() * 2); - for (int64 i = 0; i < complex_data.size(); ++i) { + const int64 complex_data_size_doubled = complex_data.size() * 2; + TF_RET_CHECK(proto.c128s_size() == complex_data_size_doubled); + for (int64 i = 0, end = complex_data.size(); i < end; ++i) { complex_data[i] = complex128{proto.c128s(i * 2), proto.c128s(i * 2 + 1)}; } @@ -2035,6 +2262,7 @@ void MutableBorrowingLiteral::CopyPieceSubtree(const Shape& shape, } } else if (shape.IsArray()) { dest_piece->set_buffer(src_piece->buffer()); + dest_piece->set_dynamic_size_buffer(src_piece->dynamic_size_buffer()); } else { // If the shape is neither an array nor tuple, then it must be // zero-sized. Otherwise, some memory needs to be allocated for it. @@ -2179,7 +2407,7 @@ BorrowingLiteral::BorrowingLiteral(absl::Span src_buf_ptrs, root_piece_.set_subshape(shape_.get()); BuildPieceSubtree(*shape_, &root_piece_); - for (int i = 0; i < src_buf_ptrs.size(); ++i) { + for (int i = 0, end = src_buf_ptrs.size(); i < end; ++i) { const auto& src_shape = shape_->tuple_shapes(i); CHECK(src_shape.IsArray()); root_piece_.child(i).set_buffer(const_cast(src_buf_ptrs[i])); diff --git a/tensorflow/compiler/xla/literal.h b/tensorflow/compiler/xla/literal.h index a2be92fbf5b..1ee71618887 100644 --- a/tensorflow/compiler/xla/literal.h +++ b/tensorflow/compiler/xla/literal.h @@ -112,6 +112,10 @@ class LiteralBase { template NativeT Get(absl::Span multi_index) const; + // Get the dynamic size on dim_index in the literal at the given shape_index. + int32 GetDynamicSize(int64 dim_index, const ShapeIndex& shape_index) const; + int32 GetDynamicSize(int64 dim_index) const; + // Returns the element value at index (0, ..., 0), however many zeroes are // required for that index. template @@ -281,6 +285,18 @@ class LiteralBase { // than being limited to a single array within the shape. Literal Relayout(const Shape& shape_with_layout) const; + // Generate a new literal whose static sizes are equal to the previous + // literal's dynamic sizes. 
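  //
  // For example (sizes are illustrative): an f32[2,<=3] literal whose dynamic
  // size on dimension 1 is 1 is returned as an f32[2,1] literal holding the
  // same elements.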
+ Literal ToStatic() const; + + // Expand a static literal into a new one with a bounded dyanmic literal. The + // static dimensions of the original literal becomes dynamic dimensions of the + // new literal, where the argument `bounded_shape` becomes the bounded shape + // of the new literal. + // + // Precondition: bounded_shape.is_dynamic() + Literal ToBoundedDynamic(const Shape& bounded_shape) const; + // Creates a new literal by reshaping this literal to have the given // dimensions. The total number of elements must not change; The // implementation currently only supports monotonic dim0-major layouts. @@ -354,10 +370,22 @@ class LiteralBase { template void Set(absl::Span index, NativeT value); + int32 GetDynamicSize(int64 dim_index) const; + void SetDynamicSize(int64 dim_index, int32 size); // Gets/sets the buffer holding the array data. char* buffer() const { return buffer_; } void set_buffer(char* buffer) { buffer_ = buffer; } + // Gets/sets the buffer holding dynamic sizes. + int32* dynamic_size_buffer() const { return dynamic_size_buffer_; } + void set_dynamic_size_buffer(int32* dynamic_size_buffer) { + dynamic_size_buffer_ = dynamic_size_buffer; + } + + int64 dynamic_size_buffer_bytes() const { + return subshape().dimensions_size() * sizeof(int32); + } + // Gets or sets the subshape of this piece. This reference points to a // subshape within the shape in the containing Literal (Literal::shape_). const Shape& subshape() const { return *subshape_; } @@ -434,15 +462,21 @@ class LiteralBase { } // Returns true if this piece and 'other' contain the same data. This piece - // and 'other' must be array-shaped and compatible. + // and 'other' must be array-shaped and compatible. If a literal has dynamic + // shape, comparison is done only for the valid elements. bool EqualElements(const Piece& other) const; + // Returns true if this piece and other pieces have the same dynamic + // dimension sizes. + bool EqualDynamicSize(const Piece& other) const; + // Writes the shape and data (if array-shaped) into the given proto. void WriteToProto(LiteralProto* proto) const; // Copy the data from 'src' into this piece's buffer. Shapes of this piece - // and src must be compatible. - Status CopyFrom(const Piece& src); + // and src must be compatible. If only_dynamic_bound is true, only elements + // within dynamic bounds will be copied. + Status CopyFrom(const Piece& src, bool only_dynamic_bound); // Copies the data from the given proto into this piece. The shape of this // piece must be equal (not just compatible) to the shape of the proto. @@ -497,9 +531,15 @@ class LiteralBase { bool EqualElementsInternal(const Piece& other, std::vector* multi_index) const; + // Internal helper to copy elements from another given piece + template + void CopyElementsWithDynamicBound(const LiteralBase::Piece& src); + // For array-shaped pieces, this is the buffer holding the literal data. char* buffer_ = nullptr; + int32* dynamic_size_buffer_ = nullptr; + // The shape of piece. This points into the shape of the containing Literal // (Literal::shape_). const Shape* subshape_ = nullptr; @@ -550,6 +590,11 @@ class MutableLiteralBase : public LiteralBase { // mutate the shape as this can produce malformed Literals. Shape* mutable_shape_do_not_use() { return shape_.get(); } + // Set the dynamic size on dim_index in the literal at the given shape_index. 
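  //
  // A minimal usage sketch (values are illustrative):
  //
  //   Literal l = LiteralUtil::CreateR1<int32>({1, 2, 3});  // s32[3]
  //   l.SetDynamicSize(/*dim_index=*/0, /*size=*/2);        // now s32[<=3](2)
  //   CHECK_EQ(l.GetDynamicSize(0), 2);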
+ void SetDynamicSize(int64 dim_index, const ShapeIndex& shape_index, + int32 size); + void SetDynamicSize(int64 dim_index, int32 size); + // Returns a pointer to the underlying buffer holding the array at the given // shape index. CHECKs if the subshape of the literal at the given ShapeIndex // is not array. @@ -560,10 +605,12 @@ class MutableLiteralBase : public LiteralBase { // Copy values from 'src_literal' rooted at 'src_shape_index' into this // literal rooted at 'dest_shape_index'. The subshape of this literal rooted // at 'dest_shape_index' must be compatible with the subshape of 'src_literal' - // rooted at 'src_shape_index', but need not be arrays. + // rooted at 'src_shape_index', but need not be arrays. If only_dynamic_bound + // is true, only elements within dynamic bounds will be copied. Status CopyFrom(const LiteralSlice& src_literal, const ShapeIndex& dest_shape_index = {}, - const ShapeIndex& src_shape_index = {}); + const ShapeIndex& src_shape_index = {}, + bool only_dynamic_bound = false); // Copies the values from src_literal, starting at src_base shape indexes, // to this literal, starting at dest_base, where the copy size in each @@ -924,9 +971,14 @@ void LiteralBase::EachCell( return; } std::vector indices(shape().rank(), 0); + + Shape shape_dynamic = shape(); + for (int64 i = 0; i < shape_dynamic.rank(); ++i) { + shape_dynamic.set_dimensions(i, GetDynamicSize(i)); + } do { per_cell(indices, Get(indices)); - } while (IndexUtil::BumpIndices(shape(), absl::MakeSpan(indices))); + } while (IndexUtil::BumpIndices(shape_dynamic, absl::MakeSpan(indices))); } template diff --git a/tensorflow/compiler/xla/literal_comparison.cc b/tensorflow/compiler/xla/literal_comparison.cc index e1f52f72e5d..155d281df0c 100644 --- a/tensorflow/compiler/xla/literal_comparison.cc +++ b/tensorflow/compiler/xla/literal_comparison.cc @@ -218,23 +218,12 @@ int64 RecursiveElementCount(const Shape& shape) { // Returns whether the given value is infinity. template bool IsInf(NativeT val) { - return std::isinf(val); + return Eigen::numext::isinf(val); } - -template <> -bool IsInf(half val) { - return std::isinf(static_cast(val)); -} - // Returns whether the given value is nan. template -float IsNan(NativeT value) { - return std::isnan(value); -} - -template <> -float IsNan(half value) { - return IsNan(static_cast(value)); +bool IsNan(NativeT value) { + return Eigen::numext::isnan(value); } // Converts the given floating-point value to a string. diff --git a/tensorflow/compiler/xla/literal_test.cc b/tensorflow/compiler/xla/literal_test.cc index 37316a2a807..a58e450a55a 100644 --- a/tensorflow/compiler/xla/literal_test.cc +++ b/tensorflow/compiler/xla/literal_test.cc @@ -149,6 +149,16 @@ TEST_F(LiteralUtilTest, R2ToString) { EXPECT_EQ(expected, literal.ToString()); } +TEST_F(LiteralUtilTest, R2DynamicToString) { + auto literal = LiteralUtil::CreateR2({{1, 2}, {3, 4}, {5, 6}}); + literal.SetDynamicSize(0, {}, 2); + const string expected = R"(s32[<=3,2](2,2) { + { 1, 2 }, + { 3, 4 } +})"; + EXPECT_EQ(expected, literal.ToString()); +} + TEST_F(LiteralUtilTest, R3ToString) { const auto literal = LiteralUtil::CreateR3({{{1}, {2}}, {{3}, {4}}, {{5}, {6}}}); @@ -421,6 +431,28 @@ TEST_F(LiteralUtilTest, TupleEquality) { EXPECT_NE(tuple1, different_tuple); } +TEST_F(LiteralUtilTest, DynamicShapeEquality) { + // Test equality with tuples. 
+ auto r1 = LiteralUtil::CreateR1({1.0, 2.0}); + r1.SetDynamicSize(0, {}, 1); + auto r2 = LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); + r2.SetDynamicSize(0, {}, 1); + auto tuple1 = LiteralUtil::MakeTuple({&r1, &r2}); + + // Tuple with the same elements. One element is shared with the original + // tuple, the other is a clone of the element in the original tuple. + auto r1_clone = LiteralUtil::CreateR1({1.0, 3.0}); + r1_clone.SetDynamicSize(0, {}, 1); + auto tuple2 = LiteralUtil::MakeTuple({&r1_clone, &r2}); + EXPECT_EQ(tuple1, tuple2); + + // Tuple with different dynamic sizes. + auto r2_clone = LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); + r2_clone.SetDynamicSize(0, {}, 2); + auto tuple_3 = LiteralUtil::MakeTuple({&r1_clone, &r2_clone}); + EXPECT_NE(tuple1, tuple_3); +} + TEST_F(LiteralUtilTest, C64Equality) { // Test equality with tuples. auto vector = LiteralUtil::CreateR1({{1.0, 2.0}, {3.0, 4.0}}); @@ -692,6 +724,47 @@ TEST_F(LiteralUtilTest, TransposeR4) { }); } +TEST_F(LiteralUtilTest, TransposeDynamicR2) { + // F32[2, <=3] (2, 1) + auto original = LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}}); + original.SetDynamicSize(1, 1); + // F32[<=3, 2] (1, 2) + auto reshape = original.Transpose(/*permutation=*/{1, 0}); + + reshape.EachCell([&](absl::Span indices, float value) { + EXPECT_EQ(value, original.Get({indices[1], indices[0]})); + }); +} + +TEST_F(LiteralUtilTest, ToStaticR2) { + // F32[2, <=3] (2, 1) + auto original = LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}}); + original.SetDynamicSize(1, 1); + // F32[2, 1] + auto static_literal = original.ToStatic(); + EXPECT_EQ(static_literal.shape(), ShapeUtil::MakeShape(F32, {2, 1})); + EXPECT_TRUE(static_literal.shape().is_static()); + + static_literal.EachCell( + [&](absl::Span indices, float value) { + EXPECT_EQ(value, original.Get({indices[0], indices[1]})); + }); +} + +TEST_F(LiteralUtilTest, ToBoundedDynamicR2) { + // F32[2, 1] + auto original = LiteralUtil::CreateR2({{1}, {4}}); + // F32[2, <=3] (2, 1) + auto dynamic_shape = ShapeUtil::MakeShape(F32, {2, 3}, {false, true}); + auto dynamic_literal = original.ToBoundedDynamic(dynamic_shape); + EXPECT_EQ(dynamic_literal.shape(), dynamic_shape); + + dynamic_literal.EachCell( + [&](absl::Span indices, float value) { + EXPECT_EQ(value, original.Get({indices[0], indices[1]})); + }); +} + TEST_F(LiteralUtilTest, TestR4RelayoutEquivalence) { // Tests that using Relayout on an array is equivalent to creating it in the // target layout in the first place. @@ -797,6 +870,38 @@ TEST_F(LiteralUtilTest, SliceR3U32Full) { EXPECT_EQ(input_2x3x2, result); } +TEST_F(LiteralUtilTest, SliceR2Dynamic) { + auto input_3x4 = LiteralUtil::CreateR2( + {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}}); + input_3x4.SetDynamicSize(1, 3); + // slice second dim from dynamic size 3 to dynamic size 1. 
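  // Starting at column 1 leaves two of the three dynamically valid columns,
  // and the width-1 slice window clamps that to one, so the result reports
  // dynamic size 1 on dimension 1.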
+ auto result = input_3x4.Slice({0, 1}, {2, 2}); + auto expected = LiteralUtil::CreateR2({{2}, {6}}); + EXPECT_EQ(expected, result); + EXPECT_EQ(result.GetDynamicSize(1), 1); +} + +TEST_F(LiteralUtilTest, SliceR2DynamicInBound) { + auto input_3x4 = LiteralUtil::CreateR2( + {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}}); + input_3x4.SetDynamicSize(1, 1); + auto result = input_3x4.Slice({0, 0}, {2, 2}); + auto expected = LiteralUtil::CreateR2({{1}, {5}}); + EXPECT_EQ(expected, result); + EXPECT_EQ(result.GetDynamicSize(1), 1); +} + +TEST_F(LiteralUtilTest, SliceR2DynamicOutOfBound) { + auto input_3x4 = LiteralUtil::CreateR2( + {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}}); + input_3x4.SetDynamicSize(1, 1); + auto result = input_3x4.Slice({0, 1}, {2, 3}); + auto expected = LiteralUtil::CreateR2({{}, {}}); + EXPECT_EQ(expected, result); + // Out of bound access clamps into 0 sized dimension. + EXPECT_EQ(result.GetDynamicSize(1), 0); +} + TEST_F(LiteralUtilTest, PopulateR1S64) { Literal output(ShapeUtil::MakeShape(S64, {1})); output.PopulateR1({77}); @@ -1510,7 +1615,7 @@ TEST_F(LiteralUtilTest, CopyFromProto_u16) { EXPECT_EQ(u1, r[3]); } -TEST_F(LiteralUtilTest, LiteralSliceTest) { +TEST_F(LiteralUtilTest, LiteralDynamicSliceTest) { auto scalar = LiteralUtil::CreateR0(1.0); auto matrix = LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); auto tuple = LiteralUtil::MakeTuple({&scalar, &matrix}); @@ -1973,6 +2078,17 @@ TEST_F(LiteralUtilTest, BroadcastScalarToMatrix) { LiteralUtil::CreateR2({{9, 9}, {9, 9}})); } +TEST_F(LiteralUtilTest, DynamicBroadcast) { + Literal literal = LiteralUtil::CreateR1({1, 2}); + literal.SetDynamicSize(0, 1); + TF_ASSERT_OK_AND_ASSIGN( + Literal broadcasted_literal, + literal.Broadcast(/*result_shape=*/ShapeUtil::MakeShape(S64, {2, 2}), + /*dimensions=*/{1})); + EXPECT_EQ(broadcasted_literal, LiteralUtil::CreateR2({{1}, {1}})); + EXPECT_EQ(broadcasted_literal.GetDynamicSize(1), 1); +} + TEST_F(LiteralUtilTest, GetAsComplex128) { complex128 value = {1, 0}; Literal c1 = LiteralUtil::CreateR0(value); diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc index 4304c207cad..0286aa20b3b 100644 --- a/tensorflow/compiler/xla/literal_util.cc +++ b/tensorflow/compiler/xla/literal_util.cc @@ -67,7 +67,7 @@ Literal ConvertType(LiteralSlice literal) { primitive_util::NativeToPrimitiveType()) { auto src = literal.data(shape_index); auto dest = result.data(shape_index); - for (int64 i = 0; i < src.size(); ++i) { + for (int64 i = 0, end = src.size(); i < end; ++i) { dest[i] = static_cast(src[i]); } } else { @@ -329,7 +329,7 @@ Literal ConvertType(LiteralSlice literal) { /* static */ Literal LiteralUtil::CreateR1U8(absl::string_view value) { Literal literal(ShapeUtil::MakeShape(U8, {static_cast(value.size())})); - for (int i = 0; i < value.size(); ++i) { + for (int i = 0, end = value.size(); i < end; ++i) { literal.Set({i}, value[i]); } return literal; @@ -345,7 +345,7 @@ Literal ConvertType(LiteralSlice literal) { absl::Span new_dimensions, absl::Span minor_to_major, const LiteralSlice& literal) { int64 new_num_elements = 1; - for (int64 i = 0; i < new_dimensions.size(); ++i) { + for (int64 i = 0, end = new_dimensions.size(); i < end; ++i) { new_num_elements *= new_dimensions[i]; } CHECK_EQ(ShapeUtil::ElementsIn(literal.shape()), new_num_elements); @@ -472,7 +472,7 @@ Literal ConvertType(LiteralSlice literal) { element_shapes.push_back(element->shape()); } Literal literal(ShapeUtil::MakeTupleShape(element_shapes)); - for (int i = 0; i < 
elements.size(); ++i) { + for (int i = 0, end = elements.size(); i < end; ++i) { TF_CHECK_OK(literal.CopyFrom(*elements[i], /*dest_shape_index=*/{i})); } return literal; @@ -485,7 +485,7 @@ Literal ConvertType(LiteralSlice literal) { element_shapes.push_back(element.shape()); } Literal literal(ShapeUtil::MakeTupleShape(element_shapes)); - for (int i = 0; i < elements.size(); ++i) { + for (int i = 0, end = elements.size(); i < end; ++i) { TF_CHECK_OK(literal.CopyFrom(elements[i], /*dest_shape_index=*/{i})); } return literal; @@ -499,7 +499,7 @@ Literal ConvertType(LiteralSlice literal) { element_shapes.push_back(element.shape()); } Literal literal(ShapeUtil::MakeTupleShape(element_shapes)); - for (int64 i = 0; i < elements.size(); ++i) { + for (int64 i = 0, end = elements.size(); i < end; ++i) { TF_CHECK_OK( literal.MoveFrom(std::move(elements[i]), /*dest_shape_index=*/{i})); } diff --git a/tensorflow/compiler/xla/metric_table_report.cc b/tensorflow/compiler/xla/metric_table_report.cc index bad65ac3201..1a616341315 100644 --- a/tensorflow/compiler/xla/metric_table_report.cc +++ b/tensorflow/compiler/xla/metric_table_report.cc @@ -80,9 +80,11 @@ void MetricTableReport::WriteReportToInfoLog(double expected_metric_sum) { int64 pos = 0; const string report = MakeReport(expected_metric_sum); - while (pos < report.size()) { + const int report_size = report.size(); + while (pos < report_size) { int64 end_of_line = report.find('\n', pos); - if (end_of_line == string::npos) { + const int64 _npos = string::npos; + if (end_of_line == _npos) { end_of_line = report.size(); } absl::string_view line(report.data() + pos, end_of_line - pos); @@ -161,7 +163,8 @@ void MetricTableReport::AppendCategoryTable() { const char* const kIndentPrefix = " * "; int64 entries_to_show = std::min(max_entries_per_category_to_show_, category.entries.size()); - if (category.entries.size() == entries_to_show + 1) { + const int64 category_entries_size = category.entries.size(); + if (category_entries_size == entries_to_show + 1) { // May as well show the last entry on the line that would otherwise say // that there is a single entry not shown. ++entries_to_show; @@ -224,7 +227,8 @@ void MetricTableReport::AppendTableRow(const string& text, const double metric, // Don't try to make a gigantic string and crash if expected_metric_sum_ is // wrong somehow. int64 padding_len = 1; - if (max_metric_string_size >= metric_string.size()) { + const int64 metric_string_size = metric_string.size(); + if (max_metric_string_size >= metric_string_size) { padding_len += max_metric_string_size - metric_string.size(); } string padding(padding_len, ' '); @@ -254,7 +258,7 @@ string MetricTableReport::MetricString(double metric) { sp1.remove_prefix(1); } // Copy rest of input characters. 
- for (int64 i = 0; i < sp1.size(); ++i) { + for (int64 i = 0, end = sp1.size(); i < end; ++i) { if (i > 0 && (sp1.size() - i) % 3 == 0) { output.push_back(','); } diff --git a/tensorflow/compiler/xla/pjrt/BUILD b/tensorflow/compiler/xla/pjrt/BUILD index 6e61e0600a0..5b3b75eb352 100644 --- a/tensorflow/compiler/xla/pjrt/BUILD +++ b/tensorflow/compiler/xla/pjrt/BUILD @@ -59,6 +59,10 @@ cc_library( name = "tracked_device_buffer", srcs = ["tracked_device_buffer.cc"], hdrs = ["tracked_device_buffer.h"], + visibility = [ + "//learning/pathways/data_parallel:__pkg__", + "//tensorflow:internal", + ], deps = [ ":event_pool", ":local_device_state", diff --git a/tensorflow/compiler/xla/pjrt/distributed/client.cc b/tensorflow/compiler/xla/pjrt/distributed/client.cc index 830e512b156..43c0c7b277d 100644 --- a/tensorflow/compiler/xla/pjrt/distributed/client.cc +++ b/tensorflow/compiler/xla/pjrt/distributed/client.cc @@ -17,14 +17,16 @@ limitations under the License. #include // NOLINT +#include "absl/time/time.h" #include "tensorflow/compiler/xla/pjrt/distributed/protocol.h" #include "tensorflow/compiler/xla/pjrt/distributed/util.h" namespace xla { DistributedRuntimeClient::DistributedRuntimeClient( - std::shared_ptr<::grpc::Channel> channel) - : stub_(grpc::DistributedRuntimeService::NewStub(std::move(channel))) {} + std::shared_ptr<::grpc::Channel> channel, absl::Duration rpc_timeout) + : stub_(grpc::DistributedRuntimeService::NewStub(std::move(channel))), + rpc_timeout_(rpc_timeout) {} DistributedRuntimeClient::~DistributedRuntimeClient() = default; xla::Status DistributedRuntimeClient::Connect( @@ -35,6 +37,7 @@ xla::Status DistributedRuntimeClient::Connect( ctx.set_deadline(absl::ToChronoTime(absl::Now() + rpc_timeout_)); ConnectRequest request; request.set_protocol_version(kDistributedRuntimeProtocolVersion); + request.set_timeout_milliseconds(absl::ToInt64Milliseconds(rpc_timeout_)); *request.mutable_local_topology() = local_topology; VLOG(10) << "Connect: " << request.DebugString(); ConnectResponse response; diff --git a/tensorflow/compiler/xla/pjrt/distributed/client.h b/tensorflow/compiler/xla/pjrt/distributed/client.h index 865a752849e..049d76af4d6 100644 --- a/tensorflow/compiler/xla/pjrt/distributed/client.h +++ b/tensorflow/compiler/xla/pjrt/distributed/client.h @@ -29,7 +29,10 @@ namespace xla { class DistributedRuntimeClient { public: - explicit DistributedRuntimeClient(std::shared_ptr<::grpc::Channel> channel); + DistributedRuntimeClient(std::shared_ptr<::grpc::Channel> channel, + absl::Duration rpc_timeout); + explicit DistributedRuntimeClient(std::shared_ptr<::grpc::Channel> channel) + : DistributedRuntimeClient(channel, absl::Seconds(120)) {} ~DistributedRuntimeClient(); xla::Status Connect(const LocalTopologyProto& local_topology, @@ -42,7 +45,7 @@ class DistributedRuntimeClient { private: const std::unique_ptr stub_; - const absl::Duration rpc_timeout_ = absl::Seconds(120); + const absl::Duration rpc_timeout_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/pjrt/distributed/protocol.h b/tensorflow/compiler/xla/pjrt/distributed/protocol.h index 4daa939ac8d..e8be43006f7 100644 --- a/tensorflow/compiler/xla/pjrt/distributed/protocol.h +++ b/tensorflow/compiler/xla/pjrt/distributed/protocol.h @@ -18,7 +18,7 @@ limitations under the License. 
namespace xla { -static constexpr int kDistributedRuntimeProtocolVersion = 1; +static constexpr int kDistributedRuntimeProtocolVersion = 2; } // namespace xla diff --git a/tensorflow/compiler/xla/pjrt/distributed/protocol.proto b/tensorflow/compiler/xla/pjrt/distributed/protocol.proto index 18bfa221110..c3bbb3a7f5d 100644 --- a/tensorflow/compiler/xla/pjrt/distributed/protocol.proto +++ b/tensorflow/compiler/xla/pjrt/distributed/protocol.proto @@ -61,6 +61,7 @@ message ConnectRequest { int32 protocol_version = 1; // Always 1 at present. LocalTopologyProto local_topology = 2; + int32 timeout_milliseconds = 3; } message ConnectResponse { diff --git a/tensorflow/compiler/xla/pjrt/distributed/service.cc b/tensorflow/compiler/xla/pjrt/distributed/service.cc index 3325fcd8319..868529637de 100644 --- a/tensorflow/compiler/xla/pjrt/distributed/service.cc +++ b/tensorflow/compiler/xla/pjrt/distributed/service.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/xla/pjrt/distributed/service.h" +#include "absl/time/time.h" #include "tensorflow/compiler/xla/pjrt/distributed/protocol.h" #include "tensorflow/compiler/xla/pjrt/distributed/util.h" #include "tensorflow/compiler/xla/status.h" @@ -69,11 +70,12 @@ void BuildGlobalTopology(absl::Span local_topologies, mu_.AssertHeld(); return num_nodes_present_ == nodes_.size(); }; + auto connect_timeout = absl::Milliseconds(request->timeout_milliseconds()); if (!mu_.AwaitWithTimeout(absl::Condition(&all_nodes_present), - kConnectTimeout)) { + connect_timeout)) { return ToGrpcStatus(tensorflow::errors::DeadlineExceeded( "Timed out after %s waiting for all nodes to call Connect()", - absl::FormatDuration(kConnectTimeout))); + absl::FormatDuration(connect_timeout))); } if (node_id == 0) { diff --git a/tensorflow/compiler/xla/pjrt/distributed/service.h b/tensorflow/compiler/xla/pjrt/distributed/service.h index 9ecbdb3cc7c..fe323d9f3b2 100644 --- a/tensorflow/compiler/xla/pjrt/distributed/service.h +++ b/tensorflow/compiler/xla/pjrt/distributed/service.h @@ -50,8 +50,6 @@ class DistributedRuntimeServiceImpl final KeyValueSetResponse* response) override; private: - const absl::Duration kConnectTimeout = absl::Seconds(120); - absl::Mutex mu_; enum class State { kInitializing, kRunning }; State state_ ABSL_GUARDED_BY(mu_) = State::kInitializing; diff --git a/tensorflow/compiler/xla/pjrt/pjrt_client.cc b/tensorflow/compiler/xla/pjrt/pjrt_client.cc index 7e0d0159f4b..c5dce4a37f7 100644 --- a/tensorflow/compiler/xla/pjrt/pjrt_client.cc +++ b/tensorflow/compiler/xla/pjrt/pjrt_client.cc @@ -1004,7 +1004,7 @@ PjRtBuffer::GetBufferForHoldLocked(ScopedHold::Type type) { // acquiring any other kind of hold. WaitForOutstandingDonationHold(); if (device_buffer_ == nullptr) { - return InvalidArgument("Hold requested on invalid buffer"); + return InvalidArgument("Hold requested on deleted or donated buffer"); } else { ++holds_[type]; } @@ -1084,7 +1084,8 @@ PjRtBuffer::CopyToHostAsyncInternal(bool discard_cached_copy, // We can't perform any other action while a donation hold is in progress. 
WaitForOutstandingDonationHold(); if (device_buffer_ == nullptr) { - return InvalidArgument("CopyToHostAsync() called on invalid buffer."); + return InvalidArgument( + "CopyToHostAsync() called on deleted or donated buffer"); } if (discard_cached_copy) { auto it = host_values_.find(host_layout); @@ -1154,7 +1155,7 @@ StatusOr> PjRtBuffer::ToLiteral( TF_ASSIGN_OR_RETURN(std::shared_ptr host_value, CopyToHostAsyncInternal(discard_cached_copy, layout)); if (host_value == nullptr) { - return InvalidArgument("ToLiteral called on invalid buffer"); + return InvalidArgument("ToLiteral called on deleted or donated buffer"); } host_value->ready.WaitForNotification(); TF_RETURN_IF_ERROR(host_value->status); @@ -1272,7 +1273,8 @@ StatusOr> PjRtBuffer::CopyToDevice( // We can't perform any other action while a donation hold is in progress. WaitForOutstandingDonationHold(); if (device_buffer_ == nullptr) { - return InvalidArgument("CopyToDevice called on invalid buffer"); + return InvalidArgument( + "CopyToDevice called on deleted or donated buffer"); } AcquireHoldLocked(&src_device_buffer); } @@ -1313,7 +1315,8 @@ Status PjRtBuffer::BlockHostUntilReady() { { absl::MutexLock lock(&mu_); if (device_buffer_ == nullptr) { - return InvalidArgument("BlockHostUntilReady() called on invalid buffer."); + return InvalidArgument( + "BlockHostUntilReady() called on deleted or donated buffer"); } device_buffer = device_buffer_; } @@ -1383,7 +1386,7 @@ StatusOr MakeTupleHelper( local_device->compute_stream()->parent(), root_table_memory.cref())); } - ExecutionInput execution_input(on_device_shape); + ExecutionInput execution_input(on_device_shape, on_host_shape); ShapeTree::iterator input_iterator = execution_input.MutableBuffers()->begin(); ShapeTree::iterator iterator_end = @@ -1521,7 +1524,6 @@ StatusOr PjRtExecutable::EnqueueExecution( << " mapped to device ordinal for execution: " << device_ordinal; absl::flat_hash_set events; - std::vector argument_host_shapes; std::vector execution_inputs; device_buffers->reserve(argument_handles.size()); const absl::flat_hash_set& parameters_that_must_be_donated = @@ -1570,24 +1572,22 @@ StatusOr PjRtExecutable::EnqueueExecution( } LocalDeviceState* device_state = &client_->device_state(device_ordinal); - TupleHandle tuple_handle; + absl::optional tuple_handle; if (parameter_is_tupled_arguments_ && !options.arguments_are_tupled) { TF_ASSIGN_OR_RETURN(tuple_handle, MakeTupleHelper(client_, device_state, argument_handles, *device_buffers, device_ordinal)); - events.insert(tuple_handle.event.get()); - execution_inputs.emplace_back(std::move(tuple_handle.execution_input)); - argument_host_shapes.push_back(&tuple_handle.on_host_shape); + events.insert(tuple_handle->event.get()); + execution_inputs.emplace_back(std::move(tuple_handle->execution_input)); } else { - argument_host_shapes.reserve(argument_handles.size()); execution_inputs.reserve(argument_handles.size()); for (int i = 0; i < argument_handles.size(); ++i) { PjRtBuffer* handle = argument_handles[i]; - argument_host_shapes.push_back(&handle->on_host_shape()); const PjRtBuffer::ScopedHold& device_buffer = (*device_buffers)[i]; // Make an ExecutionInput from the device buffer. 
- execution_inputs.emplace_back(handle->on_device_shape()); + execution_inputs.emplace_back(handle->on_device_shape(), + handle->on_host_shape()); ExecutionInput& execution_input = execution_inputs.back(); ShapeTree::iterator input_iterator = execution_input.MutableBuffers()->begin(); @@ -1613,6 +1613,10 @@ StatusOr PjRtExecutable::EnqueueExecution( run_options.set_run_id(run_id); run_options.set_rng_seed(device_state->GetNewPrngSeed()); run_options.set_gpu_executable_run_options(client_->gpu_run_options()); + run_options.set_launch_id(options.launch_id); + if (run_options.launch_id() != 0) { + VLOG(1) << "launch id for " << name() << ": " << run_options.launch_id(); + } // The choice of where we wait is arbitrary; the reason for the wait is // pacing to avoid problems such as memory fragmentation and running ahead @@ -1623,8 +1627,8 @@ StatusOr PjRtExecutable::EnqueueExecution( device_state->compute_semaphore().ScopedAcquire(1)); StatusOr result_buffer_or_status = - executables_[executable_idx]->RunAsync( - argument_host_shapes, std::move(execution_inputs), run_options); + executables_[executable_idx]->RunAsync(std::move(execution_inputs), + run_options); VLOG(1) << "Replica " << replica << " partition " << partition << " completed; ok=" << result_buffer_or_status.ok(); @@ -2141,13 +2145,13 @@ StatusOr, Shape>> GetShardedProgramShapes( client->client()->Compile(computation, argument_layout_pointers, build_options)); - auto py_executable = absl::make_unique( + auto executable = absl::make_unique( std::move(local_executables), options.parameter_is_tupled_arguments, std::move(device_assignment), std::move(local_logical_device_ids), std::move(local_devices), client); - TF_RETURN_IF_ERROR(py_executable->SetUpDonation( - client, options.parameter_is_tupled_arguments)); - return py_executable; + TF_RETURN_IF_ERROR( + executable->SetUpDonation(client, options.parameter_is_tupled_arguments)); + return executable; } } // namespace xla diff --git a/tensorflow/compiler/xla/pjrt/pjrt_client.h b/tensorflow/compiler/xla/pjrt/pjrt_client.h index b234027adf3..bb9093a8bf7 100644 --- a/tensorflow/compiler/xla/pjrt/pjrt_client.h +++ b/tensorflow/compiler/xla/pjrt/pjrt_client.h @@ -119,6 +119,8 @@ struct PjRtCrossHostRecvBuffer { using PjRtCrossHostRecvNotifier = std::function>&&)>; +class PjRtExecutable; + // Encapsulates the state of Python session with XLA. // // It is the responsibility of the client of this API to keep the PjRtClient @@ -181,6 +183,13 @@ class PjRtClient { virtual StatusOr> GetParametersThatMustBeDonated( const LocalExecutable& executable, bool tuple_inputs) const; + // Generates a unique fingerprint for `executable`. See + // PjRtExecutable::fingerprint_. + virtual StatusOr> ExecutableFingerprint( + const PjRtExecutable& executable) const { + return absl::optional(); + } + protected: friend class PjRtBuffer; virtual void EnqueueCrossHostReceive( @@ -668,6 +677,11 @@ struct ExecuteOptions { // If true, the computation must return a tuple, which will be destructured // into its elements. bool untuple_result = false; + // If non-zero, identifies this execution as part of a potentially + // multi-device launch. This can be used to detect scheduling errors, e.g. if + // multi-host programs are launched in different orders on different hosts, + // the launch IDs may be used by the runtime to detect the mismatch. 
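+ // Callers typically derive the launch ID from a fingerprint of the executable (PyExecutable, for example, applies tensorflow::Fingerprint32 to the executable fingerprint) so that every participant in the same launch supplies the same value.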
+ int32 launch_id = 0; }; // Represents a compiled computation that can be executed given handles to @@ -687,6 +701,8 @@ class PjRtExecutable { std::vector> local_logical_device_ids, std::vector local_devices, PjRtClient* client); + virtual ~PjRtExecutable() = default; + PjRtClient* client() const { return client_; } int num_replicas() const { @@ -744,12 +760,14 @@ class PjRtExecutable { // Initializes information about which arguments to which executables must be // donated due to aliases that were specified by the computation. Status SetUpDonation(PjRtClient* client, bool tuple_inputs); + StatusOr EnqueueExecution( absl::Span argument_handles, int replica, int partition, int executable_idx, const RunId& run_id, const ExecuteOptions& options, Device* device, std::vector* device_buffers, std::shared_ptr device_assignment) const; + StatusOr>> ExecuteHelper( absl::Span argument_handles, int replica, int partition, const RunId& run_id, const ExecuteOptions& options, diff --git a/tensorflow/compiler/xla/primitive_util.cc b/tensorflow/compiler/xla/primitive_util.cc index 2143d1dfbe7..c932469c56a 100644 --- a/tensorflow/compiler/xla/primitive_util.cc +++ b/tensorflow/compiler/xla/primitive_util.cc @@ -112,6 +112,21 @@ xla::PrimitiveType UnsignedIntegralTypeForBitWidth(int64 src_bitwidth) { } } +xla::PrimitiveType SignedIntegralTypeForBitWidth(int64 src_bitwidth) { + switch (src_bitwidth) { + case 8: + return xla::S8; + case 16: + return xla::S16; + case 32: + return xla::S32; + case 64: + return xla::S64; + default: + return xla::PRIMITIVE_TYPE_INVALID; + } +} + PrimitiveType ComplexComponentType(PrimitiveType complex_type) { switch (complex_type) { case C64: diff --git a/tensorflow/compiler/xla/primitive_util.h b/tensorflow/compiler/xla/primitive_util.h index 034c14e8930..1228b4f9a32 100644 --- a/tensorflow/compiler/xla/primitive_util.h +++ b/tensorflow/compiler/xla/primitive_util.h @@ -153,6 +153,8 @@ int BitWidth(PrimitiveType type); PrimitiveType UnsignedIntegralTypeForBitWidth(int64 src_bitwidth); +PrimitiveType SignedIntegralTypeForBitWidth(int64 src_bitwidth); + // Returns the real, imag component type underlying the given complex type. // LOG(FATAL)'s if complex_type is not complex. 
PrimitiveType ComplexComponentType(PrimitiveType complex_type); diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD index 94e345091eb..aa55a39218d 100644 --- a/tensorflow/compiler/xla/python/BUILD +++ b/tensorflow/compiler/xla/python/BUILD @@ -202,6 +202,7 @@ cc_library( "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla/pjrt:pjrt_client", + "//tensorflow/core/platform:fingerprint", "//tensorflow/core/profiler:protos_all_cc", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:flat_hash_map", @@ -400,7 +401,7 @@ pybind_extension( "//tensorflow/core:lib_internal_impl", # buildcleaner: keep "//tensorflow/core/profiler/lib:profiler_backends", "//tensorflow/core/profiler/lib:profiler_session", - "//tensorflow/core/profiler/rpc:profiler_server", + "//tensorflow/core/profiler/rpc:profiler_server_impl", "//tensorflow/python/profiler/internal:traceme_wrapper", "//tensorflow/stream_executor:device_memory_allocator", "//tensorflow/stream_executor:platform", diff --git a/tensorflow/compiler/xla/python/bfloat16.cc b/tensorflow/compiler/xla/python/bfloat16.cc index 6d80a60550b..1f21b3fb242 100644 --- a/tensorflow/compiler/xla/python/bfloat16.cc +++ b/tensorflow/compiler/xla/python/bfloat16.cc @@ -455,10 +455,10 @@ int NPyBfloat16_Compare(const void* a, const void* b, void* arr) { return 1; } // NaNs sort to the end. - if (!std::isnan(x) && std::isnan(y)) { + if (!Eigen::numext::isnan(x) && Eigen::numext::isnan(y)) { return -1; } - if (std::isnan(x) && !std::isnan(y)) { + if (Eigen::numext::isnan(x) && !Eigen::numext::isnan(y)) { return 1; } return 0; @@ -962,7 +962,7 @@ struct Frexp { struct Heaviside { bfloat16 operator()(bfloat16 bx, bfloat16 h0) { float x = static_cast(bx); - if (std::isnan(x)) { + if (Eigen::numext::isnan(x)) { return bx; } if (x < 0) { @@ -984,7 +984,9 @@ struct IsInf { bool operator()(bfloat16 a) { return std::isinf(static_cast(a)); } }; struct IsNan { - bool operator()(bfloat16 a) { return std::isnan(static_cast(a)); } + bool operator()(bfloat16 a) { + return Eigen::numext::isnan(static_cast(a)); + } }; struct Ldexp { bfloat16 operator()(bfloat16 a, int exp) { @@ -1200,25 +1202,25 @@ struct Ge { struct Maximum { bfloat16 operator()(bfloat16 a, bfloat16 b) { float fa(a), fb(b); - return std::isnan(fa) || fa > fb ? a : b; + return Eigen::numext::isnan(fa) || fa > fb ? a : b; } }; struct Minimum { bfloat16 operator()(bfloat16 a, bfloat16 b) { float fa(a), fb(b); - return std::isnan(fa) || fa < fb ? a : b; + return Eigen::numext::isnan(fa) || fa < fb ? a : b; } }; struct Fmax { bfloat16 operator()(bfloat16 a, bfloat16 b) { float fa(a), fb(b); - return std::isnan(fb) || fa > fb ? a : b; + return Eigen::numext::isnan(fb) || fa > fb ? a : b; } }; struct Fmin { bfloat16 operator()(bfloat16 a, bfloat16 b) { float fa(a), fb(b); - return std::isnan(fb) || fa < fb ? a : b; + return Eigen::numext::isnan(fb) || fa < fb ? 
a : b; } }; @@ -1244,7 +1246,8 @@ struct NextAfter { float from_as_float(from), to_as_float(to); memcpy(&from_as_int, &from, sizeof(bfloat16)); memcpy(&to_as_int, &to, sizeof(bfloat16)); - if (std::isnan(from_as_float) || std::isnan(to_as_float)) { + if (Eigen::numext::isnan(from_as_float) || + Eigen::numext::isnan(to_as_float)) { return bfloat16(std::numeric_limits::quiet_NaN()); } if (from_as_int == to_as_int) { diff --git a/tensorflow/compiler/xla/python/ops.cc b/tensorflow/compiler/xla/python/ops.cc index 9362a367dfc..3ac4709b160 100644 --- a/tensorflow/compiler/xla/python/ops.cc +++ b/tensorflow/compiler/xla/python/ops.cc @@ -114,24 +114,26 @@ void BuildOpsSubmodule(py::module* m) { "CustomCall", [](XlaBuilder* builder, const py::bytes& call_target_name, absl::Span operands, const Shape& shape, - const py::bytes& opaque) -> XlaOp { - return CustomCall(builder, call_target_name, operands, shape, opaque); + const py::bytes& opaque, bool has_side_effect) -> XlaOp { + return CustomCall(builder, call_target_name, operands, shape, opaque, + has_side_effect); }, py::arg("builder"), py::arg("call_target_name"), py::arg("operands"), - py::arg("shape"), py::arg("opaque") = py::bytes("")); + py::arg("shape"), py::arg("opaque") = py::bytes(""), + py::arg("has_side_effect") = false); ops.def( "CustomCallWithLayout", [](XlaBuilder* builder, const py::bytes& call_target_name, absl::Span operands, const Shape& shape_with_layout, absl::Span operand_shapes_with_layout, - const py::bytes& opaque) -> XlaOp { - return CustomCallWithLayout(builder, call_target_name, operands, - shape_with_layout, - operand_shapes_with_layout, opaque); + const py::bytes& opaque, bool has_side_effect) -> XlaOp { + return CustomCallWithLayout( + builder, call_target_name, operands, shape_with_layout, + operand_shapes_with_layout, opaque, has_side_effect); }, py::arg("builder"), py::arg("call_target_name"), py::arg("operands"), py::arg("shape_with_layout"), py::arg("operand_shapes_with_layout"), - py::arg("opaque") = py::bytes("")); + py::arg("opaque") = py::bytes(""), py::arg("has_side_effect") = false); ops.def("Dot", &Dot, py::arg("lhs"), py::arg("rhs"), py::arg("precision_config") = nullptr); ops.def("DotGeneral", &DotGeneral, py::arg("lhs"), py::arg("rhs"), diff --git a/tensorflow/compiler/xla/python/py_client.cc b/tensorflow/compiler/xla/python/py_client.cc index bc7244cfc64..9b95f8e03de 100644 --- a/tensorflow/compiler/xla/python/py_client.cc +++ b/tensorflow/compiler/xla/python/py_client.cc @@ -15,6 +15,8 @@ limitations under the License. 
#include "tensorflow/compiler/xla/python/py_client.h" +#include + #include "absl/container/flat_hash_map.h" #include "tensorflow/compiler/xla/python/py_buffer.h" #include "tensorflow/compiler/xla/python/py_executable.h" @@ -83,7 +85,7 @@ PyClient::GetDefaultDeviceAssignment1D(int num_replicas) { return result; } -StatusOr> PyClient::BufferFromPyal( +StatusOr> PyClient::BufferFromPyval( const pybind11::object& argument, Device* device, bool force_copy, PjRtBuffer::HostBufferSemantics host_buffer_semantics) { if (device == nullptr) { @@ -104,7 +106,6 @@ StatusOr> PyClient::BufferFromPyal( return InvalidArgument("from_python argument must be an array."); } - TF_ASSIGN_OR_RETURN(PythonBufferTree tree, GetPythonBufferTree(argument)); std::shared_ptr py_buffer_ref = GlobalPyRefManager()->ManageReference(std::move(c->array)); @@ -121,18 +122,22 @@ StatusOr> PyClient::BufferFromPyal( std::move(traceback)); } -StatusOr> PyClient::Compile( +StatusOr> PyClient::Compile( const XlaComputation& computation, CompileOptions options) { std::unique_ptr executable; + absl::optional fingerprint; { py::gil_scoped_release gil_release; TF_ASSIGN_OR_RETURN(executable, PjRtExecutable::Compile(computation, pjrt_client_.get(), std::move(options))); + TF_ASSIGN_OR_RETURN(fingerprint, + pjrt_client_->ExecutableFingerprint(*executable)); } auto traceback = Traceback::Get(); - return std::make_unique( - shared_from_this(), std::move(executable), std::move(traceback)); + return std::make_shared( + shared_from_this(), std::move(executable), std::move(traceback), + std::move(fingerprint)); } class ProfileBuilder { @@ -275,7 +280,8 @@ py::bytes PyClient::HeapProfile() { kind_label->set_str(buffer_string_id); auto* device_label = sample->add_label(); device_label->set_key(device_string_id); - device_label->set_num(entry.first.device->id()); + device_label->set_str( + builder.StringId(entry.first.device->DebugString())); } else { kind_label->set_str(executable_string_id); } diff --git a/tensorflow/compiler/xla/python/py_client.h b/tensorflow/compiler/xla/python/py_client.h index be61bd74419..e41415c42f2 100644 --- a/tensorflow/compiler/xla/python/py_client.h +++ b/tensorflow/compiler/xla/python/py_client.h @@ -120,11 +120,11 @@ class PyClient : public std::enable_shared_from_this { return pjrt_client_->client()->CreateHostToDeviceChannelHandle(); } - StatusOr> BufferFromPyal( + StatusOr> BufferFromPyval( const pybind11::object& argument, Device* device, bool force_copy, PjRtBuffer::HostBufferSemantics host_buffer_semantics); - StatusOr> Compile( + StatusOr> Compile( const XlaComputation& computation, CompileOptions options); pybind11::bytes HeapProfile(); diff --git a/tensorflow/compiler/xla/python/py_executable.cc b/tensorflow/compiler/xla/python/py_executable.cc index c56fd3a89fc..ed524f1cb33 100644 --- a/tensorflow/compiler/xla/python/py_executable.cc +++ b/tensorflow/compiler/xla/python/py_executable.cc @@ -16,6 +16,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/python/py_executable.h" #include "absl/algorithm/container.h" +#include "tensorflow/core/platform/fingerprint.h" namespace xla { @@ -23,10 +24,12 @@ namespace py = pybind11; PyExecutable::PyExecutable(std::shared_ptr client, std::unique_ptr executable, - std::shared_ptr traceback) + std::shared_ptr traceback, + absl::optional fingerprint) : client_(std::move(client)), executable_(std::move(executable)), - traceback_(std::move(traceback)) { + traceback_(std::move(traceback)), + fingerprint_(std::move(fingerprint)) { CHECK(PyGILState_Check()); next_ = client_->executables_; client_->executables_ = this; @@ -34,6 +37,12 @@ PyExecutable::PyExecutable(std::shared_ptr client, if (next_) { next_->prev_ = this; } + options_.untuple_result = true; + if (fingerprint_) { + options_.launch_id = tensorflow::Fingerprint32(*fingerprint_); + VLOG(1) << "Fingerprint for executable " << executable_->name() << ": " + << *fingerprint_; + } } PyExecutable::~PyExecutable() { @@ -58,18 +67,33 @@ std::vector> PyExecutable::LocalDevices() const { return devices; } +StatusOr>> PyExecutable::PjRtExecute( + absl::Span args) { + std::vector> output_buffers; + { + py::gil_scoped_release gil_release; + TF_ASSIGN_OR_RETURN(output_buffers, executable_->Execute(args, options_)); + } + auto traceback = Traceback::Get(); + std::vector> outputs; + outputs.reserve(output_buffers.size()); + for (auto& buffer : output_buffers) { + outputs.push_back( + std::make_unique(client_, std::move(buffer), traceback)); + } + return outputs; +} + StatusOr>> PyExecutable::Execute( absl::Span args) { std::vector> output_buffers; { py::gil_scoped_release gil_release; - ExecuteOptions options; - options.untuple_result = true; std::vector arg_buffers(args.size()); absl::c_transform(args, arg_buffers.begin(), [](PyBuffer* buf) { return buf->buffer(); }); TF_ASSIGN_OR_RETURN(output_buffers, - executable_->Execute(arg_buffers, options)); + executable_->Execute(arg_buffers, options_)); } auto traceback = Traceback::Get(); std::vector> outputs; @@ -87,8 +111,6 @@ PyExecutable::ExecuteOnLocalDevices( std::vector>> output_buffers; { py::gil_scoped_release gil_release; - ExecuteOptions options; - options.untuple_result = true; std::vector> arg_buffers(args.size()); for (int computation = 0; computation < args.size(); ++computation) { arg_buffers[computation].resize(args[computation].size()); @@ -96,7 +118,7 @@ PyExecutable::ExecuteOnLocalDevices( [](PyBuffer* buf) { return buf->buffer(); }); } TF_ASSIGN_OR_RETURN(output_buffers, executable_->ExecuteOnLocalDevices( - arg_buffers, options)); + arg_buffers, options_)); } auto traceback = Traceback::Get(); std::vector>> outputs; diff --git a/tensorflow/compiler/xla/python/py_executable.h b/tensorflow/compiler/xla/python/py_executable.h index 7f35f97f6e9..24f177261e7 100644 --- a/tensorflow/compiler/xla/python/py_executable.h +++ b/tensorflow/compiler/xla/python/py_executable.h @@ -37,7 +37,8 @@ class PyExecutable { public: PyExecutable(std::shared_ptr client, std::unique_ptr executable, - std::shared_ptr traceback); + std::shared_ptr traceback, + absl::optional fingerprint); ~PyExecutable(); std::shared_ptr client() const { return client_; } @@ -57,6 +58,10 @@ class PyExecutable { StatusOr>> Execute( absl::Span args); + // Same as above, but take as inputs `PjRtBuffer*`. Only targets C++ code. 
+ StatusOr>> PjRtExecute( + absl::Span args); + StatusOr>>> ExecuteOnLocalDevices(absl::Span> args); @@ -64,6 +69,8 @@ class PyExecutable { Traceback* traceback() { return traceback_.get(); } + const PjRtExecutable& pjrt_executable() const { return *executable_; } + private: friend class PyClient; @@ -71,6 +78,14 @@ class PyExecutable { std::unique_ptr executable_; std::shared_ptr traceback_; + // Identical executables (i.e. representing the same program) will have the + // same fingerprint. nullopt on platforms or executables where fingerprints + // aren't implemented. + absl::optional fingerprint_; + + // The options to pass to `executable_.Execute`. + ExecuteOptions options_; + // Doubly-linked list of all executables known to the client. Protected by the // GIL. PyExecutable* next_; diff --git a/tensorflow/compiler/xla/python/tpu_driver/grpc_tpu_driver.cc b/tensorflow/compiler/xla/python/tpu_driver/grpc_tpu_driver.cc index 7632f21d5b2..c6aff604aee 100644 --- a/tensorflow/compiler/xla/python/tpu_driver/grpc_tpu_driver.cc +++ b/tensorflow/compiler/xla/python/tpu_driver/grpc_tpu_driver.cc @@ -657,7 +657,7 @@ void GrpcTpuStream::StreamWriterFn() { request_lock_.Unlock(); for (const auto& r : reqs) { - TraceMe activity(absl::StrCat("GrpcTpuStream::Send ")); + TraceMe activity("GrpcTpuStream::Send "); ::grpc::WriteOptions opts; opts.set_no_compression().clear_buffer_hint(); stream_->Write(r, opts); @@ -721,7 +721,7 @@ std::unique_ptr GrpcTpuStream::Allocate( absl::Span wait_for) { auto req = absl::make_unique(); InitializeRequest(req.get(), wait_for); - TraceMe activity(absl::StrCat("GrpcTpuStream::Allocate(num_bytes)")); + TraceMe activity("GrpcTpuStream::Allocate(num_bytes)"); req->mutable_alloc()->set_core_id(core_id); req->mutable_alloc()->set_region(region); req->mutable_alloc()->set_num_bytes(num_bytes); @@ -737,7 +737,7 @@ std::unique_ptr GrpcTpuStream::Allocate( absl::Span wait_for) { auto req = absl::make_unique(); InitializeRequest(req.get(), wait_for); - TraceMe activity(absl::StrCat("GrpcTpuStream::Allocate(shape)")); + TraceMe activity("GrpcTpuStream::Allocate(shape)"); req->mutable_alloc()->set_core_id(core_id); req->mutable_alloc()->set_region(region); *req->mutable_alloc()->mutable_shape() = shape; @@ -754,7 +754,7 @@ std::unique_ptr GrpcTpuStream::AllocateTuple( absl::Span wait_for) { auto req = absl::make_unique(); InitializeRequest(req.get(), wait_for); - TraceMe activity(absl::StrCat("GrpcTpuStream::AllocateTuple")); + TraceMe activity("GrpcTpuStream::AllocateTuple"); req->mutable_alloc_tuple()->set_core_id(core_id); req->mutable_alloc_tuple()->set_region(region); for (auto child : children) { @@ -771,7 +771,7 @@ std::shared_ptr GrpcTpuStream::Deallocate( std::unique_ptr handle, absl::Span wait_for) { auto req = absl::make_unique(); InitializeRequest(req.get(), wait_for); - TraceMe activity(absl::StrCat("GrpcTpuStream::Deallocate")); + TraceMe activity("GrpcTpuStream::Deallocate"); auto grpc_handle = static_cast(handle.get()); req->mutable_dealloc()->set_handle(grpc_handle->id().AsInt()); auto event = @@ -784,7 +784,7 @@ std::shared_ptr GrpcTpuStream::TransferToDevice( const void* src, BufferHandle* dst, absl::Span wait_for) { auto req = absl::make_unique(); InitializeRequest(req.get(), wait_for); - TraceMe activity(absl::StrCat("GrpcTpuStream::TransferToDevice")); + TraceMe activity("GrpcTpuStream::TransferToDevice"); req->mutable_transfer_to()->mutable_data()->assign( static_cast(src), dst->size_in_bytes()); req->mutable_transfer_to()->set_target_handle( @@ -799,7 
+799,7 @@ std::shared_ptr GrpcTpuStream::TransferFromDevice( const BufferHandle* src, void* dst, absl::Span wait_for) { auto req = absl::make_unique(); InitializeRequest(req.get(), wait_for); - TraceMe activity(absl::StrCat("GrpcTpuStream::TransferFromDevice")); + TraceMe activity("GrpcTpuStream::TransferFromDevice"); req->mutable_transfer_from()->set_source_handle( static_cast(src)->id().AsInt()); EventId event_id = EventId::FromInt(req->operation_id()); @@ -818,8 +818,10 @@ std::shared_ptr GrpcTpuStream::TransferFromDeviceToDevice( absl::Span wait_for) { auto req = absl::make_unique(); InitializeRequest(req.get(), wait_for); - TraceMe activity(absl::StrCat("GrpcTpuStream::TransferFromDeviceToDevice", - req->operation_id())); + TraceMe activity([&req] { + return absl::StrCat("GrpcTpuStream::TransferFromDeviceToDevice", + req->operation_id()); + }); req->mutable_transfer_from_to()->set_source_handle( static_cast(src)->id().AsInt()); @@ -836,7 +838,7 @@ std::unique_ptr GrpcTpuStream::CompileProgram( absl::Span wait_for) { auto req = absl::make_unique(); InitializeRequest(req.get(), wait_for); - TraceMe activity(absl::StrCat("GrpcTpuStream::CompileProgram")); + TraceMe activity("GrpcTpuStream::CompileProgram"); *req->mutable_compile()->mutable_hlo_program() = source; req->mutable_compile()->set_num_replicas(num_replicas); EventId event_id = EventId::FromInt(req->operation_id()); @@ -861,7 +863,7 @@ std::unique_ptr GrpcTpuStream::LoadProgram( absl::Span wait_for) { auto req = absl::make_unique(); InitializeRequest(req.get(), wait_for); - TraceMe activity(absl::StrCat("GrpcTpuStream::LoadProgram")); + TraceMe activity("GrpcTpuStream::LoadProgram"); req->mutable_load()->set_core_id(core_id); auto grpc_handle = static_cast(handle); if (grpc_handle->id().client_id != driver_->client_id()) { @@ -884,7 +886,7 @@ std::shared_ptr GrpcTpuStream::UnloadProgram( absl::Span wait_for) { auto req = absl::make_unique(); InitializeRequest(req.get(), wait_for); - TraceMe activity(absl::StrCat("GrpcTpuStream::UnloadProgram")); + TraceMe activity("GrpcTpuStream::UnloadProgram"); req->mutable_unload()->set_loaded_program_handle( static_cast(handle.get())->id().AsInt()); auto event = diff --git a/tensorflow/compiler/xla/python/tpu_driver/platform/external/compat.h b/tensorflow/compiler/xla/python/tpu_driver/platform/external/compat.h index 285d59e2304..0c7cc370e2a 100644 --- a/tensorflow/compiler/xla/python/tpu_driver/platform/external/compat.h +++ b/tensorflow/compiler/xla/python/tpu_driver/platform/external/compat.h @@ -35,7 +35,13 @@ class Thread { class TraceMe { public: - explicit TraceMe(absl::string_view tag, int level = 1) {} + explicit TraceMe(absl::string_view name, int level = 1) {} + explicit TraceMe(std::string&& name, int level = 1) = delete; + explicit TraceMe(const std::string& name, int level = 1) = delete; + explicit TraceMe(const char* raw, int level = 1) + : TraceMe(absl::string_view(raw), level) {} + template + explicit TraceMe(NameGeneratorT name_generator, int level = 1) {} ~TraceMe() {} }; diff --git a/tensorflow/compiler/xla/python/xla.cc b/tensorflow/compiler/xla/python/xla.cc index ed9b80775d8..510175cebf6 100644 --- a/tensorflow/compiler/xla/python/xla.cc +++ b/tensorflow/compiler/xla/python/xla.cc @@ -539,7 +539,7 @@ PYBIND11_MODULE(xla_extension, m) { &PyClient::CreateDeviceToHostChannelHandle) .def("create_host_to_device_channel_handle", &PyClient::CreateHostToDeviceChannelHandle) - .def("buffer_from_pyval", &PyClient::BufferFromPyal, py::arg("argument"), + .def("buffer_from_pyval", 
&PyClient::BufferFromPyval, py::arg("argument"), py::arg("device") = nullptr, py::arg("force_copy") = false, py::arg("host_buffer_semantics") = PjRtBuffer::HostBufferSemantics::kZeroCopy) @@ -654,7 +654,7 @@ PYBIND11_MODULE(xla_extension, m) { PyTypeObject* buffer_type = reinterpret_cast(buffer.ptr()); buffer_type->tp_as_buffer = PyBuffer::BufferProtocol(); - py::class_> executable( + py::class_> executable( m, "Executable"); executable.def_property_readonly("client", &PyExecutable::client) .def("local_logical_device_ids", &PyExecutable::local_logical_device_ids) diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index ed786992e4f..f5618b95c3e 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -296,6 +296,7 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_matmul", "//tensorflow/core:lib", + "//third_party/eigen3", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/base", "@com_google_absl//absl/container:inlined_vector", @@ -473,6 +474,7 @@ cc_library( "//tensorflow/compiler/xla:array", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto_cc", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/types:optional", @@ -1698,11 +1700,15 @@ cc_library( cc_library( name = "hlo_creation_utils", srcs = ["hlo_creation_utils.cc"], - hdrs = ["hlo_creation_utils.h"], + hdrs = [ + "hlo_creation_utils.h", + "//tensorflow/compiler/xla:literal_util", + ], deps = [ ":hlo", ":hlo_module_config", ":shape_inference", + "//tensorflow/compiler/xla:comparison_util", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:statusor", @@ -1813,6 +1819,21 @@ cc_library( ], ) +cc_library( + name = "comparison_expander", + srcs = ["comparison_expander.cc"], + hdrs = ["comparison_expander.h"], + deps = [ + ":hlo", + ":hlo_creation_utils", + ":hlo_pass", + ":op_expander_pass", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla/client/lib:comparators", + ], +) + cc_library( name = "scatter_expander", srcs = ["scatter_expander.cc"], @@ -1871,6 +1892,27 @@ cc_library( ], ) +tf_cc_test( + name = "triangular_solve_expander_test", + size = "medium", + srcs = ["triangular_solve_expander_test.cc"], + shard_count = 3, + deps = [ + ":hlo", + ":triangular_solve_expander", + "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:reference_util", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:verified_hlo_module", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:test", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/memory", + ], +) + cc_library( name = "cholesky_expander", srcs = ["cholesky_expander.cc"], @@ -2235,6 +2277,7 @@ tf_cc_test( srcs = ["gather_expander_test.cc"], deps = [ ":gather_expander", + ":hlo_query", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:test_macros_header", @@ -3383,6 +3426,15 @@ cc_library( ], ) +cc_library( + name = "memory_space_assignment_repacking", + hdrs = ["memory_space_assignment_repacking.h"], + deps = [ + "//tensorflow/compiler/xla:statusor", + 
"//tensorflow/compiler/xla:types", + ], +) + cc_library( name = "memory_space_assignment", srcs = ["memory_space_assignment.cc"], @@ -3390,6 +3442,7 @@ cc_library( deps = [ ":heap_simulator", ":hlo_cost_analysis", + ":memory_space_assignment_repacking", ":memory_space_assignment_utils", "//tensorflow/compiler/xla:debug_options_flags", "//tensorflow/core/lib/math:math_util", @@ -3911,6 +3964,39 @@ tf_cc_test( ], ) +cc_library( + name = "conditional_canonicalizer", + srcs = ["conditional_canonicalizer.cc"], + hdrs = ["conditional_canonicalizer.h"], + deps = [ + ":hlo", + ":hlo_pass", + "//tensorflow/compiler/xla:status_macros", + ], +) + +tf_cc_test( + name = "conditional_canonicalizer_test", + srcs = ["conditional_canonicalizer_test.cc"], + deps = [ + ":conditional_canonicalizer", + ":hlo", + ":hlo_matchers", + ":hlo_parser", + "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:literal_test_util", + "//tensorflow/compiler/xla/tests:test_utils", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:lib", + "//tensorflow/core:test", + ], +) + cc_library( name = "hlo_get_dimension_size_rewriter", srcs = ["hlo_get_dimension_size_rewriter.cc"], @@ -4928,3 +5014,34 @@ cc_library( "//tensorflow/stream_executor/lib", ], ) + +cc_library( + name = "topk_rewriter", + srcs = ["topk_rewriter.cc"], + hdrs = ["topk_rewriter.h"], + deps = [ + ":hlo", + ":hlo_casting_utils", + ":hlo_pass", + ":pattern_matcher", + "//tensorflow/compiler/xla:shape_util", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/types:optional", + ], +) + +tf_cc_test( + name = "topk_rewriter_test", + srcs = ["topk_rewriter_test.cc"], + deps = [ + ":hlo", + ":hlo_dce", + ":hlo_matchers", + ":topk_rewriter", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:test_macros_cpu", + "//tensorflow/compiler/xla/tests:test_utils", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:test", + ], +) diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index 130661bf1cd..fa4d0e47a5d 100755 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -428,6 +428,10 @@ class AlgebraicSimplifierVisitor : public DfsHloRewriteVisitor { shape, hlo, zero, dims, AddReduce_computation)); } + // Move scalar multiply to the smallest side of convolution to + // reduce multiply computations. + Status ScalarMultiplyReduction(HloInstruction* dot); + // Convenience method for replacing an instruction with a bitcast. If operand // is not null, then the bitcast will use the specified operand instead of the // operand of the instruction. @@ -509,6 +513,9 @@ class AlgebraicSimplifierVisitor : public DfsHloRewriteVisitor { // Tries to convert slice(reshape(X)) into reshape(slice(X)) StatusOr TryToReorderSliceAndReshape(HloInstruction* slice); + // Tries to convert slice(reverse(X)) into reverse(slice(X)) + StatusOr TryToReorderSliceAndReverse(HloInstruction* slice); + // Tries to simplify `(and (< a N) (< a K))` in cases where `N <= K` into // `(< a N)`. This is crucial for being able to figure out the loop trip // count. 
@@ -560,6 +567,200 @@ bool AlgebraicSimplifierVisitor::SameShape(const HloInstruction* lhs, } } +namespace { + +float GetConstantValue(HloInstruction* inst) { + switch (inst->shape().element_type()) { + case BF16: + return static_cast(inst->literal().GetFirstElement()); + case F32: + return inst->literal().GetFirstElement(); + default: + LOG(FATAL) << "Unsupported data type: " << inst->shape().element_type(); + } +} + +bool IsOpCodeMultiplyCommutative(HloOpcode opcode) { + switch (opcode) { + case HloOpcode::kMultiply: + case HloOpcode::kTranspose: + case HloOpcode::kReshape: + case HloOpcode::kSelect: + return true; + default: + return false; + } +} + +std::unique_ptr MakeScalarInstruction(HloInstruction* target, + float multiplier) { + switch (target->shape().element_type()) { + case BF16: + return HloInstruction::CreateConstant(LiteralUtil::ConvertF32ToBF16( + LiteralUtil::CreateR0(multiplier))); + break; + case F32: + return HloInstruction::CreateConstant( + LiteralUtil::CreateR0(multiplier)); + break; + default: + LOG(FATAL) << "Unsupported data type: " << target->shape().element_type(); + } +} + +} // namespace + +Status AlgebraicSimplifierVisitor::ScalarMultiplyReduction( + HloInstruction* dot) { + // We only process bfloat16 and float32 for now. + if (dot->shape().element_type() != BF16 && + dot->shape().element_type() != F32) { + return Status::OK(); + } + + auto lhs = dot->mutable_operand(0); + auto rhs = dot->mutable_operand(1); + + const int64 dot_size = ShapeUtil::ElementsIn(dot->shape()); + const int64 lhs_size = ShapeUtil::ElementsIn(lhs->shape()); + const int64 rhs_size = ShapeUtil::ElementsIn(rhs->shape()); + + HloInstruction* target = nullptr; + // (current node, user, operand_index) + std::vector> operands; + std::vector users; + + // Find which side of dot has the smallest size: + // operand 0, operand 1, or output. + if (dot_size <= std::min(lhs_size, rhs_size)) { + target = dot; + if (dot_size < lhs_size) { + operands.emplace_back(lhs, dot, 0); + } + if (dot_size < rhs_size) { + operands.emplace_back(rhs, dot, 1); + } + } else if (lhs_size <= rhs_size) { + target = lhs; + if (lhs_size < rhs_size) { + operands.emplace_back(rhs, dot, 1); + } + if (lhs_size < dot_size && dot->user_count() == 1) { + users.push_back(dot->users().front()); + } + } else { + target = rhs; + if (rhs_size < lhs_size) { + operands.emplace_back(lhs, dot, 0); + } + if (rhs_size < dot_size && dot->user_count() == 1) { + users.push_back(dot->users().front()); + } + } + + std::vector values; + + // DFS to find scalar multiply ops from the operands. + while (!operands.empty()) { + HloInstruction* inst; + HloInstruction* user; + int64 index; + std::tie(inst, user, index) = operands.back(); + operands.pop_back(); + + // Skip the op types that are not commutative with multiply. + if (!IsOpCodeMultiplyCommutative(inst->opcode())) { + continue; + } + + HloInstruction* operand; + HloInstruction* multiplier; + // Pattern match a scalar multiply. + if (Match(inst, m::MultiplyAnyOrder( + m::Op(&operand), + m::Broadcast(m::ConstantScalar(&multiplier))))) { + CHECK_LT(index, user->operand_count()); + CHECK_EQ(inst, user->operands()[index]); + + // When found a scalar multiply, save its scalar value. + values.push_back(GetConstantValue(multiplier)); + // And remove the scalar multiply op. + TF_RETURN_IF_ERROR(user->ReplaceOperandWith(index, operand)); + inst = operand; + } + + // Push the operands of inst. 
+ int64 i = 0; + for (auto* operand : inst->operands()) { + operands.emplace_back(operand, inst, i++); + } + } + + // DFS to find scalar multiply ops from the users. + while (!users.empty()) { + auto inst = users.back(); + users.pop_back(); + + if (!IsOpCodeMultiplyCommutative(inst->opcode())) { + continue; + } + + HloInstruction* operand; + HloInstruction* multiplier; + if (Match(inst, m::MultiplyAnyOrder( + m::Op(&operand), + m::Broadcast(m::ConstantScalar(&multiplier))))) { + values.push_back(GetConstantValue(multiplier)); + + TF_RETURN_IF_ERROR(inst->ReplaceAllUsesWith(operand)); + inst = operand; + } + + // Process the instructions with only one user. + // Otherwise moving scalar multiply to the operands changes the values of + // other users. + if (inst->user_count() == 1) { + users.push_back(inst->users().front()); + } + } + + if (values.empty()) { + return Status::OK(); + } + + changed_ = true; + + // Combine all constant multipliers. + float multiplier = 1.0; + for (const float v : values) { + multiplier *= v; + } + + // Create a new const scalar multiply instruction. + HloInstruction* new_const_inst; + new_const_inst = + computation_->AddInstruction(MakeScalarInstruction(target, multiplier)); + + // Broadcast the scalar multiplier. + HloInstruction* new_broadcast = computation_->AddInstruction( + HloInstruction::CreateBroadcast(target->shape(), new_const_inst, {})); + // Create a new scalar multiply instruction. + HloInstruction* new_multiply = + computation_->AddInstruction(HloInstruction::CreateBinary( + target->shape(), HloOpcode::kMultiply, target, new_broadcast)); + CHECK_EQ(new_multiply->shape(), target->shape()); + + // Update the dependency with the rest of the instructions. + if (target == lhs) { + return dot->ReplaceOperandWith(0, new_multiply); + } else if (target == rhs) { + return dot->ReplaceOperandWith(1, new_multiply); + } else { + CHECK_EQ(target, dot); + return dot->ReplaceAllUsesWith(new_multiply); + } +} + void AlgebraicSimplifierVisitor::ReplaceWithBitcast(HloInstruction* instruction, HloInstruction* operand) { CHECK_EQ(1, instruction->operand_count()); @@ -1035,6 +1236,10 @@ Status AlgebraicSimplifierVisitor::HandleConcatenate( return Status::OK(); } + if (options_.is_layout_sensitive()) { + return Status::OK(); + } + // Check if we can merge "adjacent" slice operands which take slices from the // same other op. For simplicity we only merge unstrided slices. 
int64 concatenate_dimension = concatenate->concatenate_dimension(); @@ -1134,6 +1339,23 @@ Status AlgebraicSimplifierVisitor::HandleConcatenate( operands[pad_value_operand]->mutable_operand(0), padding_config)); return ReplaceInstruction(concatenate, pad); } + + if (absl::c_count(operands, operands[0]) == operands.size() && + operands[0]->shape().dimensions(concatenate_dimension) == 1) { + Shape new_shape = operands[0]->shape(); + absl::InlinedVector broadcast_dims; + for (int64 i = 0; i < new_shape.rank(); ++i) { + if (i == concatenate_dimension) { + continue; + } + broadcast_dims.push_back(i); + } + new_shape.DeleteDimension(concatenate_dimension); + return ReplaceInstruction( + concatenate, + MakeBroadcastHlo(MakeReshapeHlo(new_shape, operands[0]).ValueOrDie(), + broadcast_dims, concatenate->shape())); + } return Status::OK(); } @@ -2098,11 +2320,8 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) { AsInt64Slice(dot->dot_dimension_numbers().lhs_batch_dimensions()), AsInt64Slice( dot->dot_dimension_numbers().lhs_contracting_dimensions()))); - if (dot->shape().rank() != lhs->shape().rank()) { - std::vector lhs_broadcast_dims(lhs->shape().rank()); - absl::c_iota(lhs_broadcast_dims, 0); - new_lhs = computation_->AddInstruction(HloInstruction::CreateBroadcast( - dot->shape(), new_lhs, lhs_broadcast_dims)); + if (!ShapeUtil::SameElementType(dot->shape(), new_lhs->shape())) { + new_lhs = MakeConvertToHlo(new_lhs, dot->shape().element_type()); } TF_ASSIGN_OR_RETURN( HloInstruction * new_rhs, @@ -2111,6 +2330,15 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) { AsInt64Slice(dot->dot_dimension_numbers().rhs_batch_dimensions()), AsInt64Slice( dot->dot_dimension_numbers().rhs_contracting_dimensions()))); + if (!ShapeUtil::SameElementType(dot->shape(), new_rhs->shape())) { + new_rhs = MakeConvertToHlo(new_rhs, dot->shape().element_type()); + } + if (dot->shape().rank() != lhs->shape().rank()) { + std::vector lhs_broadcast_dims(lhs->shape().rank()); + absl::c_iota(lhs_broadcast_dims, 0); + new_lhs = computation_->AddInstruction(HloInstruction::CreateBroadcast( + dot->shape(), new_lhs, lhs_broadcast_dims)); + } if (dot->shape().rank() != rhs->shape().rank()) { std::vector rhs_broadcast_dims( dot->dot_dimension_numbers().lhs_batch_dimensions_size()); @@ -2129,8 +2357,6 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) { // If the lhs or rhs have only batch and contracting dimensions, a dot can be // rewritten as reduce(mul(broadcast(transpose(x)),broadcast(transpose(y)))) if (options_.enable_dot_strength_reduction() && - (ShapeUtil::ElementIsFloating(dot->shape()) || - ShapeUtil::ElementIsComplex(dot->shape())) && ((dot->dot_dimension_numbers().lhs_batch_dimensions_size() + dot->dot_dimension_numbers().lhs_contracting_dimensions_size() == lhs->shape().rank()) || @@ -2144,6 +2370,10 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) { AsInt64Slice(dot->dot_dimension_numbers().lhs_batch_dimensions()), AsInt64Slice( dot->dot_dimension_numbers().lhs_contracting_dimensions()))); + if (!ShapeUtil::SameElementType(dot->shape(), new_lhs->shape())) { + new_lhs = MakeConvertToHlo(new_lhs, dot->shape().element_type()); + } + TF_ASSIGN_OR_RETURN( HloInstruction * new_rhs, NormalizeDotOperandToBatchMajorAndContractingMinor( @@ -2151,6 +2381,9 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) { AsInt64Slice(dot->dot_dimension_numbers().rhs_batch_dimensions()), AsInt64Slice( 
dot->dot_dimension_numbers().rhs_contracting_dimensions()))); + if (!ShapeUtil::SameElementType(dot->shape(), new_rhs->shape())) { + new_rhs = MakeConvertToHlo(new_rhs, dot->shape().element_type()); + } int64 lhs_outer_dims = lhs->shape().rank() - @@ -2192,9 +2425,9 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) { std::vector reduce_dims( dot->dot_dimension_numbers().lhs_contracting_dimensions_size()); PrimitiveType dot_type = - ShapeUtil::ElementIsComplex(dot->shape()) - ? dot->shape().element_type() - : dot->shape().element_type() == F64 ? F64 : F32; + ShapeUtil::ElementIsFloating(dot->shape()) + ? (dot->shape().element_type() == F64 ? F64 : F32) + : dot->shape().element_type(); new_dot = AsType(new_dot, dot_type); const int64 outer_dims = std::max(rhs_outer_dims, lhs_outer_dims); absl::c_iota( @@ -2474,6 +2707,70 @@ Status AlgebraicSimplifierVisitor::HandleMultiply(HloInstruction* multiply) { } } + { + HloInstruction *a, *b, *c1, *c2; + // Mul(Mul(x, constant1), Mul(y, constant2)) => Mul(Mul(x, y), + // constant1*constant2) + if (Match(multiply, + m::Multiply( + m::MultiplyAnyOrder(m::NonConstant(&a), m::Constant(&c1)), + m::MultiplyAnyOrder(m::NonConstant(&b), m::Constant(&c2))))) { + TF_ASSIGN_OR_RETURN(auto* product_of_constants, + MakeBinaryHlo(HloOpcode::kMultiply, c1, c2)); + if (ShapeUtil::IsScalar(product_of_constants->shape()) && + !ShapeUtil::IsScalar(multiply->shape())) { + product_of_constants = + computation_->AddInstruction(HloInstruction::CreateBroadcast( + multiply->shape(), product_of_constants, {})); + } + + return ReplaceWithNewInstruction( + multiply, + HloInstruction::CreateBinary( + multiply->shape(), HloOpcode::kMultiply, + computation_->AddInstruction(HloInstruction::CreateBinary( + multiply->shape(), HloOpcode::kMultiply, a, b)), + product_of_constants)); + } + } + + { + HloInstruction *a, *b, *constant, *op; + // Mul(Mul(a, constant1), Broadcast(b)) => + // Mul(Broadcast(Mul(b, constant1), a)) + if (Match(multiply, + m::MultiplyAnyOrder(m::MultiplyAnyOrder(m::NonConstant(&a), + m::Constant(&constant)), + m::Op(&op))) || + Match(multiply, + m::MultiplyAnyOrder( + m::MultiplyAnyOrder(m::NonConstant(&a), + m::Broadcast(m::Constant(&constant))), + m::Op(&op)))) { + // Check that the other side was a broadcast, and not of a constant. + if (ShapeUtil::IsScalar(constant->shape()) && + Match(op, m::Broadcast(m::NonConstant()))) { + auto dims = op->dimensions(); + b = op->mutable_operand(0); + if (!ShapeUtil::IsScalar(b->shape())) { + constant = computation_->AddInstruction( + HloInstruction::CreateBroadcast(b->shape(), constant, {})); + } + + auto new_mul = + computation_->AddInstruction(HloInstruction::CreateBinary( + b->shape(), HloOpcode::kMultiply, b, constant)); + + return ReplaceWithNewInstruction( + multiply, + HloInstruction::CreateBinary( + multiply->shape(), HloOpcode::kMultiply, a, + computation_->AddInstruction(HloInstruction::CreateBroadcast( + multiply->shape(), new_mul, dims)))); + } + } + } + VLOG(10) << "trying transform [(A * C1) * C2 => A * (C1 * C2)]"; HloInstruction *a, *c1, *c2; if (Match(multiply, @@ -3087,6 +3384,17 @@ Status AlgebraicSimplifierVisitor::HandlePower(HloInstruction* power) { HloOpcode::kMultiply, lhs, lhs)); } + // Pow(A, 3) is used in GELU. 
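+ // (The tanh-based GELU approximation evaluates 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3))), which is where the cube appears.)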
+ VLOG(10) << "trying transform [pow(A, 3) => A*A*A]: " << power->ToString(); + if (IsAll(rhs, 3)) { + HloInstruction* tmp = + computation_->AddInstruction(HloInstruction::CreateBinary( + power->shape(), HloOpcode::kMultiply, lhs, lhs)); + return ReplaceWithNewInstruction( + power, HloInstruction::CreateBinary(power->shape(), + HloOpcode::kMultiply, lhs, tmp)); + } + VLOG(10) << "trying transform [pow(A, -1) => 1/A]: " << power->ToString(); if (IsAll(rhs, -1)) { return ReplaceWithNewInstruction( @@ -3576,6 +3884,52 @@ StatusOr AlgebraicSimplifierVisitor::TryToReorderSliceAndReshape( return false; } +// Allowing a slice to move through a reverse with any necessary updates to the +// slice config. +StatusOr AlgebraicSimplifierVisitor::TryToReorderSliceAndReverse( + HloInstruction* slice) { + VLOG(2) << "Entered TryToReorderSliceAndReverse for slice:" + << slice->ToString(); + if (Match(slice, m::Slice(m::Reverse()))) { + HloInstruction* reverse = slice->mutable_operand(0); + HloInstruction* reverse_operand = reverse->mutable_operand(0); + std::vector new_starts = slice->slice_starts(); + std::vector new_limits = slice->slice_limits(); + std::vector new_strides = slice->slice_strides(); + for (auto rdim : reverse->dimensions()) { + int64 start = slice->slice_starts(rdim); + int64 limit = slice->slice_limits(rdim); + int64 stride = slice->slice_strides(rdim); + // find_nth allows us to compute the appropriate index to begin + // with during reverse even in the presence of non-unit strides + int64 find_nth = (limit - start - 1) / stride; + find_nth = start + find_nth * stride; + limit = find_nth + 1; + new_starts[rdim] = + (reverse->shape().dimensions(rdim) - start) - (limit - start); + new_limits[rdim] = reverse->shape().dimensions(rdim) - start; + VLOG(2) << "Analyzing dim:" << rdim << " (start,limit):" << start << "," + << limit << " and new (start, limit):" << new_starts[rdim] << "," + << new_limits[rdim]; + } + // New slice formed from the reverse_operand, but strides and shape of the + // slice output remains the same. New slice's starts and limits are updated + // for ONLY the reversed dimensions as indicated above. + HloInstruction* new_slice = computation_->AddInstruction( + HloInstruction::CreateSlice(slice->shape(), reverse_operand, new_starts, + new_limits, new_strides)); + simplifier_->UpdateLayout(new_slice->mutable_shape()); + TF_RETURN_IF_ERROR(ReplaceWithNewInstruction( + slice, HloInstruction::CreateReverse(new_slice->shape(), new_slice, + reverse->dimensions()))); + // We do not delete the old reverse, since there might be another + // consumer of that reverse (i.e., full reverse output). DCE should take + // care of any deletion that is necessary if there was no use of reverse. + return true; + } + return false; +} + Status AlgebraicSimplifierVisitor::HandleSlice(HloInstruction* slice) { // Delete no-op slices, i.e. where shape = operand shape. 
if (ReplaceInstructionIfSameShape(slice, slice->mutable_operand(0))) { @@ -3730,6 +4084,15 @@ Status AlgebraicSimplifierVisitor::HandleSlice(HloInstruction* slice) { if (replaced) { return Status::OK(); } + + bool reversed = false; + if (Match(slice, m::Slice(m::Reverse(m::Op())))) { + TF_ASSIGN_OR_RETURN(reversed, TryToReorderSliceAndReverse(slice)); + } + if (reversed) { + return Status::OK(); + } + return Status::OK(); } @@ -3798,8 +4161,8 @@ Status AlgebraicSimplifierVisitor::HandleDynamicSlice( return ReplaceWithNewInstruction(dynamic_slice, std::move(new_broadcast)); } - // Convert a dynamic slice into a slice if all offsets are constant and the - // operand is not constant. If ev + // Convert a dynamic slice into a slice if all offsets are constant and the + // operand is not constant. if (operand->opcode() != HloOpcode::kConstant && absl::c_all_of(absl::MakeSpan(dynamic_slice->operands().begin() + 1, dynamic_slice->operands().end()), @@ -4137,13 +4500,13 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* hlo) { new_dnums.add_rhs_contracting_dimensions( dnums.rhs_batch_dimensions(batch_dim)); new_dnums.add_lhs_contracting_dimensions( - dnums.rhs_batch_dimensions(batch_dim)); + dnums.lhs_batch_dimensions(batch_dim)); ++removed_dims; } else { new_dnums.add_rhs_batch_dimensions( dnums.rhs_batch_dimensions(batch_dim)); new_dnums.add_lhs_batch_dimensions( - dnums.rhs_batch_dimensions(batch_dim)); + dnums.lhs_batch_dimensions(batch_dim)); } } std::vector reduce_dims; @@ -4697,15 +5060,17 @@ StatusOr AlgebraicSimplifierVisitor::SwapConvOperands( for (int64 spatial_dim = 0; spatial_dim < dnums.input_spatial_dimensions_size(); ++spatial_dim) { const int64 kernel_size = window_dims[spatial_dim].size(); - kernel_product *= kernel_size; const int64 dilated_kernel_size = 1 + (kernel_size - 1) * window_dims[spatial_dim].window_dilation(); const int64 input_size = input->shape().dimensions(dnums.input_spatial_dimensions(spatial_dim)); - swapped_kernel_product *= input_size; const int64 dilated_input_size = 1 + (input_size - 1) * window_dims[spatial_dim].base_dilation(); + // Don't decide to swap if the input size is one, since many convolution + // implementations can easily handle that special case efficiently. + kernel_product *= kernel_size; + swapped_kernel_product *= input_size == 1 ? kernel_size : input_size; auto new_dim = swapped_window.add_dimensions(); new_dim->set_size(input_size); @@ -4896,6 +5261,10 @@ StatusOr AlgebraicSimplifierVisitor::SimplifyConvToDot( Status AlgebraicSimplifierVisitor::HandleConvolution( HloInstruction* convolution) { + if (options_.enable_scalar_multiply_reduction()) { + TF_RETURN_IF_ERROR(ScalarMultiplyReduction(convolution)); + } + // Zero-sized input or filter. if (ShapeUtil::IsZeroElementArray(convolution->operand(0)->shape()) || ShapeUtil::IsZeroElementArray(convolution->operand(1)->shape())) { diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.h b/tensorflow/compiler/xla/service/algebraic_simplifier.h index 9f29df3c209..9f2a3404116 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.h +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.h @@ -86,6 +86,17 @@ class AlgebraicSimplifierOptions { } bool enable_conv_operand_swap() const { return enable_conv_operand_swap_; } + // Move a constant scalar multiply to the convolution operand or output with + // the smallest tensor size, to reduce the number of scalar multiplies.
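The option described above relies on convolution being linear in each operand: a constant scalar factor can be pushed to either input or to the output without changing the result, so it is cheapest to apply it to the smallest tensor involved. A minimal sketch of that identity (plain C++, not part of the patch; Conv1D is a naive stand-in for the real convolution):

#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

// Naive "valid" 1-D convolution, only to show the algebraic point.
std::vector<double> Conv1D(const std::vector<double>& x,
                           const std::vector<double>& w) {
  std::vector<double> y(x.size() - w.size() + 1, 0.0);
  for (std::size_t i = 0; i < y.size(); ++i) {
    for (std::size_t j = 0; j < w.size(); ++j) y[i] += x[i + j] * w[j];
  }
  return y;
}

int main() {
  const double k = 0.5;  // the constant scalar factor
  const std::vector<double> x = {1, 2, 3, 4, 5, 6};
  const std::vector<double> w = {0.25, -1.0, 0.75};

  // conv(k * x, w): the scalar is applied to the larger input tensor.
  std::vector<double> kx = x;
  for (double& v : kx) v *= k;
  const std::vector<double> lhs = Conv1D(kx, w);

  // k * conv(x, w): the same values, but the scalar touches the smaller
  // output, which is the kind of choice the new option makes.
  std::vector<double> rhs = Conv1D(x, w);
  for (double& v : rhs) v *= k;

  for (std::size_t i = 0; i < lhs.size(); ++i) {
    assert(std::abs(lhs[i] - rhs[i]) < 1e-12);
  }
  return 0;
}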
+ void set_enable_scalar_multiply_reduction( + bool enable_scalar_multiply_reduction) { + enable_scalar_multiply_reduction_ = enable_scalar_multiply_reduction; + } + + bool enable_scalar_multiply_reduction() const { + return enable_scalar_multiply_reduction_; + } + // If enable_window_reduce_replacement is true, the kReduceWindow instruction // can be optimized by replacement with simpler operations. void set_enable_window_reduce_to_reduce_replacement( @@ -146,6 +157,7 @@ class AlgebraicSimplifierOptions { bool enable_dot_to_multiply_rewrite_{true}; bool enable_conv_simplification_{true}; bool enable_conv_operand_swap_{true}; + bool enable_scalar_multiply_reduction_{false}; bool enable_window_reduce_to_reduce_replacement_{true}; bool enable_reduce_of_reshape_{true}; bool replace_transpose_with_bitcast_{true}; diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc index 779d6c9cdc5..95700b2a994 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc @@ -117,6 +117,52 @@ TEST_F(AlgebraicSimplifierTest, FactorFpAddition) { m::ConstantScalar(0.125)))); } +// (A*C1) * (B*C2) => (A*B)*(C1*C2) +TEST_F(AlgebraicSimplifierTest, MultiplyChain) { + const char* kModuleStr = R"( + HloModule m + test { + p0 = f32[] parameter(0) + p1 = f32[] parameter(1) + c = f32[] constant(2) + d = f32[] constant(4) + x = f32[] multiply(p0, c) + y = f32[] multiply(p1, d) + ROOT z = f32[] multiply(x, y) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT( + m->entry_computation()->root_instruction(), + GmockMatch(m::MultiplyAnyOrder( + m::MultiplyAnyOrder(m::Parameter(0), m::Parameter(1)), + m::MultiplyAnyOrder(m::ConstantScalar(2), m::ConstantScalar(4))))); +} + +// MUL(MUL(X, BROADCAST(constant)), BROADCAST(Y)) ==> +// MUL(X, BROADCAST(MUL(Y, BROADCAST(constant)))) +TEST_F(AlgebraicSimplifierTest, MultiplyBroadcastReassoc) { + const char* kModuleStr = R"( + HloModule m + test { + p0 = f32[2,2] parameter(0) + p1 = f32[] parameter(1) + b = f32[] constant(2) + c = f32[2, 2] broadcast(b), dimensions={} + x = f32[2,2] multiply(p0, c) + y = f32[2,2] broadcast(p1), dimensions={} + ROOT z = f32[2,2] multiply(y, x) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT(m->entry_computation()->root_instruction(), + GmockMatch(m::MultiplyAnyOrder( + m::Parameter(0), m::Broadcast(m::MultiplyAnyOrder( + m::Parameter(1), m::Constant()))))); +} + // A*C + B*C => (A+B)*C if C is a broadcast of a floating-point power of 2. TEST_F(AlgebraicSimplifierTest, FactorFpAdditionWithBroadcast) { const char* kModuleStr = R"( @@ -1568,6 +1614,32 @@ TEST_F(AlgebraicSimplifierTest, Pow2) { GmockMatch(m::Multiply(m::Parameter(0), m::Parameter(0)))); } +// Test that pow(A, 3) is simplified to A*A*A. 
+TEST_F(AlgebraicSimplifierTest, Pow3) { + auto m = CreateNewVerifiedModule(); + Shape r0f32 = ShapeUtil::MakeShape(F32, {}); + HloComputation::Builder builder(TestName()); + HloInstruction* param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, r0f32, "param0")); + HloInstruction* three = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(3))); + builder.AddInstruction( + HloInstruction::CreateBinary(r0f32, HloOpcode::kPower, param0, three)); + + auto computation = m->AddEntryComputation(builder.Build()); + + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Power(m::Parameter(0), m::Op().Is(three)))); + + AlgebraicSimplifier simplifier(default_options_); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); + + EXPECT_THAT( + computation->root_instruction(), + GmockMatch(m::Multiply(m::Parameter(0), + m::Multiply(m::Parameter(0), m::Parameter(0))))); +} + // Test that pow(A, -1) is simplified to 1/A. TEST_F(AlgebraicSimplifierTest, PowNegative1) { auto m = CreateNewVerifiedModule(); @@ -2014,6 +2086,80 @@ TEST_F(AlgebraicSimplifierTest, RemoveUnaryConcatenate) { EXPECT_THAT(computation->root_instruction(), param0); } +TEST_F(AlgebraicSimplifierTest, SliceReverse) { + const char* const hlo_string = R"( +HloModule module + +ENTRY test { + param = f32[6,7,32] parameter(0) + constant = f32[] constant(0) + pad = f32[8,7,32] pad(param, constant), padding=1_1x0_0x0_0 + rev = f32[8,7,32] reverse(pad), dimensions={0,2} + slice = f32[1,7,32] slice(rev), slice={[2:3:1], [0:7:1], [0:32:1]} + ROOT tuple = (f32[1,7,32]) tuple(slice) +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + AlgebraicSimplifier simplifier(default_options_); + ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + HloComputation* computation = module->entry_computation(); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Tuple(m::Reverse(m::Slice(m::Pad()))))); + const HloInstruction* slice = + computation->root_instruction()->operand(0)->operand(0); + EXPECT_TRUE( + ShapeUtil::Equal(slice->shape(), ShapeUtil::MakeShape(F32, {1, 7, 32}))); + // slice start,limit of 0th and 2nd dimensions are changed + // while 1st dimension's slice start, limit remains the same since + // it is not reversed. 
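The expected start/limit values checked below follow directly from the remapping formula in TryToReorderSliceAndReverse. A small standalone check (plain C++, not part of the patch; RemapThroughReverse is a hypothetical helper mirroring that arithmetic):

#include <cassert>
#include <cstdint>

// Mirrors the start/limit remapping used when a slice is moved before a
// reverse: a slice [start, limit, stride) of reverse(X) on a reversed
// dimension becomes a slice of X itself.
struct Range { int64_t start, limit; };

Range RemapThroughReverse(int64_t dim_size, int64_t start, int64_t limit,
                          int64_t stride) {
  int64_t find_nth = (limit - start - 1) / stride;  // last element actually taken
  find_nth = start + find_nth * stride;
  limit = find_nth + 1;                             // tighten the limit to it
  return {(dim_size - start) - (limit - start), dim_size - start};
}

int main() {
  // Dimension 0 of the SliceReverse test: size 8, slice [2:3:1] -> [5:6).
  Range r0 = RemapThroughReverse(8, 2, 3, 1);
  assert(r0.start == 5 && r0.limit == 6);
  // Dimension 2 of the non-unit-stride test: size 32, slice [0:32:5] -> [1:32).
  Range r2 = RemapThroughReverse(32, 0, 32, 5);
  assert(r2.start == 1 && r2.limit == 32);
  return 0;
}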
+ EXPECT_EQ(slice->slice_starts(0), 5); + EXPECT_EQ(slice->slice_limits(0), 6); + EXPECT_EQ(slice->slice_starts(1), 0); + EXPECT_EQ(slice->slice_limits(1), 7); + EXPECT_EQ(slice->slice_starts(2), 0); + EXPECT_EQ(slice->slice_limits(2), 32); + EXPECT_EQ(slice->slice_strides(0), 1); + EXPECT_EQ(slice->slice_strides(1), 1); + EXPECT_EQ(slice->slice_strides(2), 1); +} + +TEST_F(AlgebraicSimplifierTest, SliceReverseNonUnitEvenOddStrides) { + const char* const hlo_string = R"( +HloModule module + +ENTRY test { + param = f32[6,7,32] parameter(0) + constant = f32[] constant(0) + pad = f32[8,7,32] pad(param, constant), padding=1_1x0_0x0_0 + rev = f32[8,7,32] reverse(pad), dimensions={0,1,2} + slice = f32[1,2,7] slice(rev), slice={[2:3:2], [0:7:4], [0:32:5]} + ROOT tuple = (f32[1,2,7]) tuple(slice) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + + AlgebraicSimplifier simplifier(default_options_); + ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + HloComputation* computation = module->entry_computation(); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Tuple(m::Reverse(m::Slice(m::Pad()))))); + const HloInstruction* slice = + computation->root_instruction()->operand(0)->operand(0); + EXPECT_TRUE( + ShapeUtil::Equal(slice->shape(), ShapeUtil::MakeShape(F32, {1, 2, 7}))); + // slice start,limit of all dimensions are changed + EXPECT_EQ(slice->slice_starts(0), 5); + EXPECT_EQ(slice->slice_limits(0), 6); + EXPECT_EQ(slice->slice_starts(1), 2); + EXPECT_EQ(slice->slice_limits(1), 7); + EXPECT_EQ(slice->slice_starts(2), 1); + EXPECT_EQ(slice->slice_limits(2), 32); + EXPECT_EQ(slice->slice_strides(0), 2); + EXPECT_EQ(slice->slice_strides(1), 4); + EXPECT_EQ(slice->slice_strides(2), 5); +} + // Test that empty operands of concatenates are removed. 
TEST_F(AlgebraicSimplifierTest, RemoveEmptyConcatenateOperands) { auto m = CreateNewVerifiedModule(); @@ -4677,6 +4823,25 @@ TEST_F(AlgebraicSimplifierTest, SliceOfConcatNonScalarInput) { EXPECT_EQ(root->slice_limits(0), 2); } +TEST_F(AlgebraicSimplifierTest, ConcatToBroadcast) { + const char* hlo_string = R"( + HloModule module + + ENTRY test { + p = f32[2,1,4] parameter(0) + ROOT concat = f32[2,6,4] concatenate(p,p,p,p,p,p), dimensions={1} + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + + AlgebraicSimplifierOptions options; + AlgebraicSimplifier simplifier(options); + EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, GmockMatch(m::Broadcast(m::Reshape(m::Parameter(0))))); +} + TEST_F(AlgebraicSimplifierTest, NegateNegate) { const char* hlo_string = R"( HloModule module @@ -5197,6 +5362,59 @@ ENTRY AddBroadcastZeroWithDynamicSlice { EXPECT_THAT(root->operand(1)->opcode(), HloOpcode::kPad); } +TEST_F(AlgebraicSimplifierTest, ScalarMultiplyReduction) { + const char* hlo_string = R"( +HloModule ConstScalarMultiply +ENTRY ConstScalarMultiply { + param0 = f32[16,512,4096]{2,1,0} parameter(0) + constant.0 = f32[] constant(0.5) + broadcast.0 = f32[16,512,4096] broadcast(constant.0), dimensions={} + multiply.0 = f32[16,512,4096]{2,1,0} multiply(param0, broadcast.0) + param1 = f32[16,512,4096]{2,1,0} parameter(1) + multiply.1 = f32[16,512,4096]{2,1,0} multiply(multiply.0, param1) + param2 = f32[16,512,1024]{2,1,0} parameter(2) + constant.1 = f32[] constant(1.109) + broadcast.1 = f32[16,512,1024] broadcast(constant.1), dimensions={} + multiply.2 = f32[16,512,1024]{2,1,0} multiply(param2, broadcast.1) + ROOT convolution = f32[4096,1024,1]{1,0,2} convolution(multiply.1, multiply.2), window={size=16}, dim_labels=0fb_0io->bf0 +} +)"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + AlgebraicSimplifierOptions options; + options.set_enable_scalar_multiply_reduction(true); + AlgebraicSimplifier simplifier(options); + ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + auto root = module->entry_computation()->root_instruction(); + EXPECT_EQ(root->opcode(), HloOpcode::kMultiply); + EXPECT_THAT(root, + GmockMatch(m::MultiplyAnyOrder( + m::Op(), m::Broadcast(m::ConstantScalar(0.5f * 1.109f))))); +} + +TEST_F(AlgebraicSimplifierTest, ScalarMultiplyReductionMultiUser) { + const char* hlo_string = R"( +HloModule ConstScalarMultiply +ENTRY ConstScalarMultiply { + param0 = f32[16,512,1024] parameter(0) + param1 = f32[4096,1024,1] parameter(1) + convolution = f32[16,512,4096] convolution(param0, param1), window={size=1}, dim_labels=0bf_oi0->0bf + constant.1 = f32[] constant(0.5) + broadcast.1 = f32[16,512,4096] broadcast(constant.1), dimensions={} + multiply.1 = f32[16,512,4096] multiply(convolution, broadcast.1) + param2 = f32[16,512,4096] parameter(2) + multiply.2 = f32[16,512,4096] multiply(convolution, param2) + ROOT add.1 = f32[16,512,4096] add(multiply.1, multiply.2) +} +)"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + AlgebraicSimplifierOptions options; + options.set_enable_scalar_multiply_reduction(true); + AlgebraicSimplifier simplifier(options); + ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie()); +} + INSTANTIATE_TEST_SUITE_P(DotOfConcatSimplificationTestInstantiation, DotOfConcatSimplificationTest, ::testing::ValuesIn(kDotOfConcatTestSpecs)); @@ -6145,10 +6363,10 @@ 
TEST_F(AlgebraicSimplifierTest, ReduceOfBatchDotToContractingDimension) { } test { p0 = f32[32,8,5,6] parameter(0) - p1 = f32[32,8,6,7] parameter(1) + p1 = f32[8,32,6,7] parameter(1) d = f32[32,8,5,7] dot(p0, p1), lhs_batch_dims={0,1}, - rhs_batch_dims={0,1}, + rhs_batch_dims={1,0}, rhs_contracting_dims={2}, lhs_contracting_dims={3} c = f32[] constant(0) diff --git a/tensorflow/compiler/xla/service/comparison_expander.cc b/tensorflow/compiler/xla/service/comparison_expander.cc new file mode 100644 index 00000000000..5c88ff8cae2 --- /dev/null +++ b/tensorflow/compiler/xla/service/comparison_expander.cc @@ -0,0 +1,133 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/comparison_expander.h" + +#include "tensorflow/compiler/xla/client/lib/comparators.h" +#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_creation_utils.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/util.h" + +namespace xla { + +HloInstruction* BitcastConvertFloatingPointToIntegral( + HloComputation* computation, HloInstruction* value, + const Shape& signed_shape, const Shape& unsigned_shape, + HloInstruction* zero, HloInstruction* max_value) { + // Switch from a floating point value to a integer value in such a way that + // when using the integer value to compare, we get the same result for normal + // values, and -Nan is treated as the smallest value, and Nan is treated as + // the largest value. + // If f is a float, and + // x = bit_cast(f); + // y = x < 0 ? numeric_limits::max() - x : x; + // then y is ordered as an int32 such that finite values have the obvious + // order, -0 is ordered before 0, and -NaN and NaN appear at the beginning + // and end of the ordering. + // Note that in order to avoid -x to overflow, we calculate + // numeric_limits::max() - x as unsigned, and then convert back to + // signed. 
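The key computation sketched in the comment above can be exercised in isolation. A standalone sketch (plain C++, not part of the patch), with memcpy standing in for the HLO bitcast-converts:

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>

// Same mapping as the HLO built below: bitcast the float to int32, and for
// negative bit patterns replace the key with int32_max - unsigned_bits
// (mod 2^32). Signed comparison of the keys then yields the total order.
int32_t TotalOrderKey(float f) {
  int32_t s;
  uint32_t u;
  std::memcpy(&s, &f, sizeof f);
  std::memcpy(&u, &f, sizeof f);
  if (s >= 0) return s;
  uint32_t flipped =
      static_cast<uint32_t>(std::numeric_limits<int32_t>::max()) - u;
  std::memcpy(&s, &flipped, sizeof flipped);
  return s;
}

int main() {
  const float inf = std::numeric_limits<float>::infinity();
  const float nan = std::numeric_limits<float>::quiet_NaN();
  const float neg_nan = std::copysign(nan, -1.0f);
  // -NaN < -inf < -1 < -0 < +0 < 1 < +inf < +NaN under the key ordering.
  assert(TotalOrderKey(neg_nan) < TotalOrderKey(-inf));
  assert(TotalOrderKey(-inf) < TotalOrderKey(-1.0f));
  assert(TotalOrderKey(-1.0f) < TotalOrderKey(-0.0f));
  assert(TotalOrderKey(-0.0f) < TotalOrderKey(0.0f));
  assert(TotalOrderKey(0.0f) < TotalOrderKey(1.0f));
  assert(TotalOrderKey(1.0f) < TotalOrderKey(inf));
  assert(TotalOrderKey(inf) < TotalOrderKey(nan));
  return 0;
}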
+ auto signed_value = computation->AddInstruction( + HloInstruction::CreateBitcastConvert(signed_shape, value)); + auto unsigned_value = computation->AddInstruction( + HloInstruction::CreateBitcastConvert(unsigned_shape, value)); + auto flipped_value = computation->AddInstruction(HloInstruction::CreateBinary( + unsigned_shape, HloOpcode::kSubtract, max_value, unsigned_value)); + flipped_value = computation->AddInstruction( + HloInstruction::CreateBitcastConvert(signed_shape, flipped_value)); + auto compare_shape = signed_shape; + compare_shape.set_element_type(PRED); + auto is_negative = computation->AddInstruction(HloInstruction::CreateCompare( + compare_shape, signed_value, zero, ComparisonDirection::kLt)); + return computation->AddInstruction( + HloInstruction::CreateTernary(signed_shape, HloOpcode::kSelect, + is_negative, flipped_value, signed_value)); +} + +bool ComparisonExpander::InstructionMatchesPattern( + HloInstruction* instruction) { + if (HloCompareInstruction* compare = + dynamic_cast(instruction)) { + HloInstruction* lhs = instruction->operands()[0]; + if (compare->type() == Comparison::Type::kFloatTotalOrder && + primitive_util::IsFloatingPointType(lhs->shape().element_type())) { + return true; + } + } + return false; +} + +StatusOr ComparisonExpander::ExpandInstruction( + HloInstruction* instruction) { + CHECK(instruction->opcode() == HloOpcode::kCompare); + HloCompareInstruction* compare = + static_cast(instruction); + CHECK(compare->type() == Comparison::Type::kFloatTotalOrder); + HloComputation* computation = instruction->parent(); + HloInstruction* lhs = instruction->operands()[0]; + HloInstruction* rhs = instruction->operands()[1]; + Shape compare_shape = lhs->shape(); + PrimitiveType compare_type = compare_shape.element_type(); + CHECK(primitive_util::IsFloatingPointType(compare_type)); + // Special-case handling for BF16. We currently do not support direct + // comparisons with BF16, so we convert to F32 and then use the F32 + // comparison logic. 
+ if (compare_type == BF16) { + compare_type = F32; + compare_shape.set_element_type(compare_type); + lhs = computation->AddInstruction( + HloInstruction::CreateConvert(compare_shape, lhs)); + rhs = computation->AddInstruction( + HloInstruction::CreateConvert(compare_shape, rhs)); + } + + int64 bit_width = primitive_util::BitWidth(compare_type); + PrimitiveType signed_type = + primitive_util::SignedIntegralTypeForBitWidth(bit_width); + PrimitiveType unsigned_type = + primitive_util::UnsignedIntegralTypeForBitWidth(bit_width); + auto signed_shape = compare_shape; + signed_shape.set_element_type(signed_type); + auto unsigned_shape = compare_shape; + unsigned_shape.set_element_type(unsigned_type); + auto zero_value = computation->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(signed_type))); + zero_value = computation->AddInstruction(HloInstruction::CreateBroadcast( + signed_shape, zero_value, zero_value->shape().dimensions())); + auto max_signed = computation->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::MaxValue(signed_type))); + auto max_shape = max_signed->shape(); + max_shape.set_element_type(unsigned_type); + auto max_unsigned = computation->AddInstruction( + HloInstruction::CreateConvert(max_shape, max_signed)); + auto max_value = computation->AddInstruction(HloInstruction::CreateBroadcast( + unsigned_shape, max_unsigned, max_shape.dimensions())); + lhs = BitcastConvertFloatingPointToIntegral( + computation, lhs, signed_shape, unsigned_shape, zero_value, max_value); + rhs = BitcastConvertFloatingPointToIntegral( + computation, rhs, signed_shape, unsigned_shape, zero_value, max_value); + auto new_compare = computation->AddInstruction(HloInstruction::CreateCompare( + instruction->shape(), lhs, rhs, compare->direction(), + Comparison::Type::kSigned)); + VLOG(2) << "New comparison instruction for total order:" + << new_compare->ToString() << "\n"; + return new_compare; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/comparison_expander.h b/tensorflow/compiler/xla/service/comparison_expander.h new file mode 100644 index 00000000000..df8b5dc0137 --- /dev/null +++ b/tensorflow/compiler/xla/service/comparison_expander.h @@ -0,0 +1,47 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_COMPARISON_EXPANDER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_COMPARISON_EXPANDER_H_ + +#include + +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" +#include "tensorflow/compiler/xla/service/op_expander_pass.h" + +namespace xla { + +// A pass which performs expansion of the comparison operator to support total +// order comparison of floating point numbers. 
+class ComparisonExpander : public OpExpanderPass { + public: + explicit ComparisonExpander() = default; + ~ComparisonExpander() override = default; + absl::string_view name() const override { return "comparison-expander"; } + + private: + // Returns `true` if `instruction` should be expanded by this pass. + bool InstructionMatchesPattern(HloInstruction* instruction) override; + // Returns a replacement for `instruction`, or nullptr if no replacement is + // needed (e.g. only the to_apply subcomputation of the instruction was + // modified). + StatusOr ExpandInstruction( + HloInstruction* instruction) override; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_COMPARISON_EXPANDER_H_ diff --git a/tensorflow/compiler/xla/service/compile_only_service.cc b/tensorflow/compiler/xla/service/compile_only_service.cc index ce9c8a4ea62..f8e4f591a5d 100644 --- a/tensorflow/compiler/xla/service/compile_only_service.cc +++ b/tensorflow/compiler/xla/service/compile_only_service.cc @@ -92,6 +92,7 @@ CompileOnlyService::CompileAheadOfTime( execution_options.mutable_device_assignment())); } execution_options.set_use_spmd_partitioning(options.use_spmd_partitioning()); + execution_options.set_deduplicate_hlo(options.deduplicate_hlo()); for (const AotXlaComputationInstance& instance : computations) { TF_RET_CHECK(instance.computation.has_host_program_shape()); *execution_options.mutable_shape_with_output_layout() = diff --git a/tensorflow/compiler/xla/service/compiler.h b/tensorflow/compiler/xla/service/compiler.h index 57b24e372e6..312a068ba65 100644 --- a/tensorflow/compiler/xla/service/compiler.h +++ b/tensorflow/compiler/xla/service/compiler.h @@ -77,6 +77,7 @@ class AotCompilationOptions { virtual int64 replica_count() const { return 0; } virtual int64 num_cores() const { return 0; } virtual bool use_spmd_partitioning() const { return false; } + virtual bool deduplicate_hlo() const { return false; } // Optional allocator that may be used for allocating temp space on the device // during compilation. diff --git a/tensorflow/compiler/xla/service/conditional_canonicalizer.cc b/tensorflow/compiler/xla/service/conditional_canonicalizer.cc new file mode 100644 index 00000000000..8af8e11febd --- /dev/null +++ b/tensorflow/compiler/xla/service/conditional_canonicalizer.cc @@ -0,0 +1,63 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/conditional_canonicalizer.h" + +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/status_macros.h" + +namespace xla { +namespace { +Status CanonicalizeNonTupleConditional(HloInstruction* conditional) { + TF_RET_CHECK(conditional->opcode() == HloOpcode::kConditional); + for (auto* branch : conditional->called_computations()) { + HloInstruction* root = branch->root_instruction(); + TF_RET_CHECK(!root->shape().IsTuple()); + + HloInstruction* tuple = + branch->AddInstruction(HloInstruction::CreateTuple({root})); + branch->set_root_instruction(tuple, /*accept_different_shape=*/true); + } + auto parent = conditional->parent(); + auto root_shape = conditional->shape(); + auto new_shape = ShapeUtil::MakeTupleShape({root_shape}); + auto new_conditional = + parent->AddInstruction(conditional->CloneWithNewShape(new_shape)); + auto gte = parent->AddInstruction( + HloInstruction::CreateGetTupleElement(root_shape, new_conditional, 0)); + TF_RETURN_IF_ERROR(parent->ReplaceInstruction(conditional, gte)); + return Status::OK(); +} +} // namespace + +StatusOr ConditionalCanonicalizer::Run(HloModule* module) { + XLA_VLOG_LINES( + 2, "ConditionalCanonicalizer::Run(), before:\n" + module->ToString()); + bool changed = false; + for (auto* comp : module->MakeNonfusionComputations()) { + for (auto* inst : comp->MakeInstructionPostOrder()) { + if (inst->opcode() == HloOpcode::kConditional && + !inst->shape().IsTuple()) { + TF_RETURN_IF_ERROR(CanonicalizeNonTupleConditional(inst)); + changed = true; + } + } + } + XLA_VLOG_LINES( + 2, "ConditionalCanonicalizer::Run(), after:\n" + module->ToString()); + return changed; +} +} // namespace xla diff --git a/tensorflow/compiler/xla/service/conditional_canonicalizer.h b/tensorflow/compiler/xla/service/conditional_canonicalizer.h new file mode 100644 index 00000000000..a390d87a007 --- /dev/null +++ b/tensorflow/compiler/xla/service/conditional_canonicalizer.h @@ -0,0 +1,38 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CONDITIONAL_CANONICALIZER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_CONDITIONAL_CANONICALIZER_H_ + +#include + +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" + +namespace xla { + +// Canonicalize output of conditionals, make non-tuple outputs into tuple with +// single element output. After this pass, all conditional instructions have +// tuple outputs. 
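As a rough analogy for what this pass does, outside of HLO entirely: every branch result is wrapped into a one-element tuple and consumers read it back through a get-tuple-element, so downstream passes only ever see tuple-shaped conditionals. A toy sketch (plain C++, purely illustrative):

#include <cassert>
#include <tuple>

// Before canonicalization: each "branch" returns a bare value.
int TrueBranch() { return 0; }
int FalseBranch() { return 1; }

// After canonicalization: every branch returns a one-element tuple, and the
// caller reads the result through a get-tuple-element (std::get<0> here).
std::tuple<int> TrueBranchCanonical() { return std::make_tuple(TrueBranch()); }
std::tuple<int> FalseBranchCanonical() {
  return std::make_tuple(FalseBranch());
}

int main() {
  const bool pred = false;
  std::tuple<int> result =
      pred ? TrueBranchCanonical() : FalseBranchCanonical();
  // Equivalent to the get-tuple-element inserted after the new conditional.
  assert(std::get<0>(result) == (pred ? TrueBranch() : FalseBranch()));
  return 0;
}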
+class ConditionalCanonicalizer : public HloModulePass { + public: + absl::string_view name() const override { + return "conditional canonicalizer"; + } + + StatusOr Run(HloModule* module) override; +}; +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CONDITIONAL_CANONICALIZER_H_ diff --git a/tensorflow/compiler/xla/service/conditional_canonicalizer_test.cc b/tensorflow/compiler/xla/service/conditional_canonicalizer_test.cc new file mode 100644 index 00000000000..498260cbabf --- /dev/null +++ b/tensorflow/compiler/xla/service/conditional_canonicalizer_test.cc @@ -0,0 +1,72 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/conditional_canonicalizer.h" + +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/tests/literal_test_util.h" +#include "tensorflow/compiler/xla/tests/test_utils.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { +namespace { + +namespace op = xla::testing::opcode_matchers; + +class ConditionalCanonicalizerTest : public HloTestBase { + protected: + ConditionalCanonicalizerTest() {} +}; + +TEST_F(ConditionalCanonicalizerTest, DenseArrayConditionalRewrite) { + auto module = ParseAndReturnVerifiedModule(R"( +HloModule _ +true_branch { + true_param = (s32[3,2]) parameter(0) + ROOT root = s32[] constant(0) +} + +false_branch { + false_param = (s32[3,2]) parameter(0) + ROOT root = s32[] constant(1) +} + +ENTRY entry { + param0 = s32[3,2] parameter(0) + branch = pred[] constant(false) + param_tuple = (s32[3 ,2]) tuple(param0) + ROOT conditional = s32[] conditional(branch, param_tuple, param_tuple), + true_computation=true_branch, false_computation=false_branch +} +)") + .ValueOrDie(); + ConditionalCanonicalizer pass; + EXPECT_TRUE(pass.Run(module.get()).ValueOrDie()); + EXPECT_THAT(module->entry_computation()->root_instruction(), + op::GetTupleElement(op::Conditional())); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/conditional_code_motion.cc b/tensorflow/compiler/xla/service/conditional_code_motion.cc index 6db4c3eb6d4..cdda0aeb925 100644 --- a/tensorflow/compiler/xla/service/conditional_code_motion.cc +++ b/tensorflow/compiler/xla/service/conditional_code_motion.cc @@ -18,7 +18,6 @@ limitations under the License. 
#include #include #include -#include #include #include @@ -46,161 +45,81 @@ limitations under the License. namespace xla { -namespace { +namespace conditional_opt { -struct ConditionalBoundary { - ConditionalBoundary(HloInstruction* op, int64 op_index, HloInstruction* usr) - : operand(op), operand_index(op_index), user(usr) {} - // `operand` is one of `user`'s operand. - - // Instruction that remains in the conditional but one of its user - // is moved out of conditonal. - HloInstruction* operand; - // operand_index for `operand` in the `user`. - int64 operand_index; - // Instruction that moved out of conditional. - HloInstruction* user; -}; - -// Visit the root instructions to its operands follow BFS. -// Will visit an instructions after all its users have been visited. Parameters -// are not visited. -class BranchVisitor { +class BoundaryVisitor { public: - explicit BranchVisitor(const HloComputation* branch_computation) { - HloInstruction* root_inst = branch_computation->root_instruction(); - worklist_.push_back(root_inst); - visited_.insert(root_inst); - for (auto parameter_inst : branch_computation->parameter_instructions()) { - parameter_instructions_.insert(parameter_inst); - } + // start with an existing conditional computation. + explicit BoundaryVisitor(HloInstruction* conditional) { + Boundary b(Boundary::Position::kInsideBranch); + b.mutable_operands().push_back(conditional); + worklist_.push_back(b); } - // Get next intruction to visit. - HloInstruction* GetNextInstruction() { - if (!worklist_.empty()) { - HloInstruction* inst = worklist_.front(); + // Start with an empty work list. + BoundaryVisitor() {} + // Get next boundary to visit. + Boundary PopNextBoundary() { + CHECK(!worklist_.empty()); + Boundary b = worklist_.front(); + worklist_.pop_front(); + // if b is already visited, it must have multiple users and is already in + // new boundaries. Skip it. Only checking the first operand of b because b + // is expected to have at least one operand, and all the operands in b + // must be identical instructions from different branches for b to be moved. + while (!worklist_.empty() && ContainsKey(visited_, b.operands()[0])) { + b = worklist_.front(); worklist_.pop_front(); - return inst; } - return nullptr; + visited_.insert(b.operands()[0]); + return b; + } + void AddToWorkList(const Boundary& b) { + CHECK(!b.operands().empty()); + worklist_.push_back(b); } - // Add operands of one instruction to worklist for further visit. - void AddInstructionOperands(HloInstruction* inst) { - int64 operand_count = inst->operand_count(); - for (int i = 0; i < operand_count; i++) { - HloInstruction* operand = inst->mutable_operand(i); - if (ContainsKey(visited_, operand)) { - continue; + bool HasNextBoundary() { + while (!worklist_.empty()) { + Boundary b = worklist_.front(); + if (!ContainsKey(visited_, b.operands()[0])) { + break; } - bool all_user_visited = std::all_of( - operand->users().begin(), operand->users().end(), - [&](HloInstruction* user) { return ContainsKey(visited_, user); }); - - if (!all_user_visited) { - continue; - } - // Do not visit parameter_instructions. - if (ContainsKey(parameter_instructions_, operand)) { - // Add the operand and this instruction to the boundaries. - boundaries_.emplace_back(operand, i, inst); - continue; - } - worklist_.push_back(operand); - visited_.insert(operand); + worklist_.pop_front(); } - } - - // Add instruction and its users to conditional boundaries. 
- void AddInstructionToBoundary(HloInstruction* inst) { - for (auto user : inst->users()) { - boundaries_.emplace_back(inst, user->operand_index(inst), user); - } - } - - // Add instruction to the to be removed instructions set and vector. - void AddInstructionToHoist(HloInstruction* inst) { - instructions_to_hoist_set_.insert(inst); - instructions_to_hoist_.emplace_back(inst); - } - - // If visitor has next instruction to visit. - bool HasNextInstruction() const { return !worklist_.empty(); } - - // If there is no hoist intruction. - int64 HoistInstructionSize() { return instructions_to_hoist_.size(); } - - // Get boundaries of this branch. - const std::vector& boundaries() const { - return boundaries_; - } - - // Get instructions to hoist in this branch. - const std::vector& instructions_to_hoist() const { - return instructions_to_hoist_; - } - - // Get hoist instruction set in this branch. - const std::unordered_set& instructions_to_hoist_set() const { - return instructions_to_hoist_set_; + return !worklist_.empty(); } private: // worklist is the deque that contains instructions to be visited. - std::deque worklist_; - - // instructions that has been visited. - std::unordered_set visited_; - - // parameter instructions of the branch. - std::unordered_set parameter_instructions_; - - // Boundaries contains the set of instructions that its operand is within - // conditional but it can be hoist out of conditional. - std::vector boundaries_; - - // Instructions to hoist. - std::unordered_set instructions_to_hoist_set_; - - // Instructions to hoist, the order within this vector is BFS and - // an instruction's order will always be after its users. - std::vector instructions_to_hoist_; + std::deque worklist_; + absl::flat_hash_set visited_; }; -// Returns true if `instruction` is worth hoisting out. -bool WorthHoisting(HloInstruction* instruction) { - for (const auto* operand : instruction->operands()) { - // Only move out instructions that won't share the same operand - // to avoid copy of the operand. - if (operand->user_count() > 1) { - return false; - } - } - switch (instruction->opcode()) { - case HloOpcode::kConvert: - // If Convert is after AllReduce, it is worth moving out AllReduce out - // of conditional for AR/CRS combine. If Convert is after other ops such - // as Dot or Convolutional, it is better to keep convert within - // conditional so that convert can be fused with Dot or Convolutional. - // - // TODO(b/154283721): figure out the scenario when convert can be fused - // with AllReduce out of conditional. - if (instruction->operand(0)->opcode() == HloOpcode::kAllReduce) { - return true; - } - return false; - case HloOpcode::kAllReduce: - case HloOpcode::kAdd: - case HloOpcode::kConstant: - case HloOpcode::kSubtract: - case HloOpcode::kMultiply: - case HloOpcode::kDivide: - case HloOpcode::kTuple: - case HloOpcode::kSqrt: +// Returns estimation of potential reuses carried by a given pair of +// instructions. 
Use different integers to classify different levels +// of reuses This is used as a placeholder only, assuming all +// instructions can be fused to enable data reuses +int64 ReusesCarriedBy(HloInstruction* op, HloInstruction* user) { + VLOG(1) << "ConditionalCodeMotion: Add reuses carried by instr: " + << op->ToString() << "=>" << user->ToString() << "\n"; + switch (user->opcode()) { case HloOpcode::kGetTupleElement: - return true; + case HloOpcode::kTuple: + return 0; default: - return false; + break; + } + switch (op->opcode()) { + // These instructions are lightweight and easy to fuse. + case HloOpcode::kConstant: + case HloOpcode::kGetTupleElement: + return 0; + default: + // Assume fusion will not happen anyway if user count > 1) + if (op->user_count() > 1) { + return 0; + } + return 10; } } @@ -220,7 +139,7 @@ bool InstructionWithinBranchIdentical( return *a == *b; }; - if (instructions[0] == nullptr) { + if (instructions.empty()) { return false; } @@ -248,109 +167,42 @@ bool InstructionWithinBranchIdentical( }); } -// Returns if all the visitors/branches has next instruction to visit. -bool HasNextInstruction(const std::vector& visitors) { - bool has_next = true; - for (const auto& visitor : visitors) { - has_next &= visitor.HasNextInstruction(); - } - return has_next; -} - -// Create tuple element as the new root of the branch. The tuple will contain -// the operands that can't move out of conditional but its user will be moved -// out of conditional. -HloInstruction* CreateNewRoot( - const std::vector& boundaries, - const std::unordered_set& instructions_to_hoist_set, - HloComputation* computation) { - std::vector elements; - elements.reserve(boundaries.size()); - for (auto boundary : boundaries) { - if (ContainsKey(instructions_to_hoist_set, boundary.user)) { - elements.push_back(boundary.operand); +// Copy the ith instruction in boundary to outside of conditional, or do the +// opposite (for moving in). +Status CopyInOrOutOfConditional( + Boundary& boundary, int64 dest_index, HloComputation* parent, + absl::flat_hash_map& hoisted_instructions) { + CHECK(dest_index == 0 || boundary.IsOutsideBranch()); + HloInstruction* op = boundary.operands()[0]; + absl::InlinedVector new_operands; + for (int i = 0; i < op->operands().size(); ++i) { + auto op_i = op->operands()[i]; + VLOG(2) << "Looking for operand:" << op_i->ToString() << "\n"; + if (ContainsKey(hoisted_instructions, op_i)) { + auto new_op_i = + FindOrDie(hoisted_instructions, op_i).operands()[dest_index]; + VLOG(2) << "new operand:" << new_op_i->ToString() << "\n"; + new_operands.push_back(new_op_i); + } else { + CHECK(op_i->opcode() == HloOpcode::kConstant); + auto new_op_i = parent->AddInstruction(op_i->Clone()); + VLOG(2) << "new operand:" << new_op_i->ToString() << "\n"; + new_operands.push_back(new_op_i); } } - return computation->AddInstruction(HloInstruction::CreateTuple(elements)); -} - -// Copy identical instructions within conditional outside of conditional. -void CopyIdenticalInstructionsOutOfConditional( - const std::vector& instructions_to_hoist, - HloComputation* conditional_parent, - absl::flat_hash_map* - hoisted_instructions) { - int64 instructions_size = instructions_to_hoist.size(); - // Visit the operands before its users and copy it, so that the copied - // user will point to the correct operand. 
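The reuse estimate introduced above is intentionally coarse. A simplified stand-in (plain C++, not part of the patch; string opcodes replace HloOpcode) that mirrors its scoring:

#include <cstdint>
#include <cstdio>
#include <string>

// Simplified stand-in for the heuristic: cheap producers (constants,
// get-tuple-element) and cheap consumers (tuple, get-tuple-element) carry no
// reuse; anything else with a single consumer is assumed fusible and scores a
// fixed weight of 10.
struct Instr {
  std::string opcode;
  int user_count;
};

int64_t ToyReusesCarriedBy(const Instr& op, const Instr& user) {
  if (user.opcode == "get-tuple-element" || user.opcode == "tuple") return 0;
  if (op.opcode == "constant" || op.opcode == "get-tuple-element") return 0;
  if (op.user_count > 1) return 0;  // assume no fusion across a fan-out
  return 10;
}

int main() {
  Instr convert{"convert", 1}, add{"add", 1}, root{"tuple", 1}, c{"constant", 2};
  // A convert feeding a single add is worth keeping next to it...
  std::printf("convert->add: %lld\n", (long long)ToyReusesCarriedBy(convert, add));
  // ...while a constant, or anything feeding only the root tuple, scores zero.
  std::printf("constant->add: %lld\n", (long long)ToyReusesCarriedBy(c, add));
  std::printf("add->tuple: %lld\n", (long long)ToyReusesCarriedBy(add, root));
  return 0;
}

The scores only matter relative to each other; BenefitForMovingBoundaries sums them on both sides of a boundary and compares the two sums to decide the direction of motion.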
- for (int64 i = instructions_size - 1; i >= 0; i--) { - HloInstruction* old_instruction = instructions_to_hoist[i]; - auto get_new_operand = [&](HloInstruction* old_operand) { - // If the operand can't be found in `instructions_to_hoist`, this - // operand will be in the `boundaries`, GetTupleElement instructions - // will be added later to replace this operand. - if (!ContainsKey(*hoisted_instructions, old_operand)) { - return old_operand; - } - return FindOrDie(*hoisted_instructions, old_operand); - }; - - absl::InlinedVector new_operands; - absl::c_transform(old_instruction->operands(), - std::back_inserter(new_operands), get_new_operand); - - HloInstruction* new_instruction = conditional_parent->AddInstruction( - old_instruction->CloneWithNewOperands(old_instruction->shape(), - new_operands)); - // Maps the instruction outside of conditional to the instruction - // inside of the conditional. - InsertOrDie(hoisted_instructions, old_instruction, new_instruction); - } -} - -// If there are instructions to hoist, the root of the conditional must be -// moved out. Change the users of the conditional to the hoisted instruction -// of the new root. -Status ChangeConditionalUsers( - HloInstruction* conditional, HloInstruction* old_root, - const absl::flat_hash_map& - hoisted_instructions) { - HloInstruction* new_root = FindOrDie(hoisted_instructions, old_root); - TF_RETURN_IF_ERROR(conditional->ReplaceAllUsesWith(new_root)); - return Status::OK(); -} - -// Insert GetTupleElement before the instructions whose operands might still -// be within the conditional. -Status CreateGetTupleElementAfterConditional( - const std::vector& boundaries, - const std::unordered_set& instructions_to_hoist_set, - const absl::flat_hash_map& - hoisted_instructions, - HloInstruction* conditional, HloComputation* computation) { - int boundary_instruction_size = boundaries.size(); - - // Inserts GetTupleElement before the boundary instructions. - for (int i = 0; i < boundary_instruction_size; i++) { - HloInstruction* gte = - computation->AddInstruction(HloInstruction::CreateGetTupleElement( - boundaries[i].operand->shape(), conditional, i)); - - HloInstruction* new_instruction = - FindOrDie(hoisted_instructions, boundaries[i].user); - TF_RETURN_IF_ERROR( - new_instruction->ReplaceOperandWith(boundaries[i].operand_index, gte)); - } - return Status::OK(); -} - -// Remove instructions to be hoisted out of the branch computation. -Status RemoveInstructionFromComputation( - const std::vector& instructions_to_hoist, - HloComputation* branch) { - // Will visit the instructions after its users. - for (auto* instruction : instructions_to_hoist) { - TF_RETURN_IF_ERROR(branch->RemoveInstruction(instruction)); + HloInstruction* new_instruction = parent->AddInstruction( + op->CloneWithNewOperands(op->shape(), new_operands)); + VLOG(2) << "new instruction:" << new_instruction->ToString() << "\n"; + // Maps the instruction outside of conditional to the instruction + // inside of the conditional. + for (HloInstruction* op : boundary.operands()) { + Boundary b2 = ContainsKey(hoisted_instructions, op) + ? hoisted_instructions[op] + : Boundary(boundary.IsOutsideBranch() + ? 
Boundary::Position::kInsideBranch + : Boundary::Position::kOutsideBranch); + b2.mutable_operands().push_back(new_instruction); + hoisted_instructions[op] = b2; } return Status::OK(); } @@ -482,7 +334,7 @@ StatusOr ConvertSpecialMove(HloInstruction* conditional, old_root = conditional->branch_computation(branch)->root_instruction(); absl::flat_hash_map map_inst_to_tuple_index; std::vector new_operands(old_root->operand_count()); - std::unordered_set to_hoist_set; + absl::flat_hash_set to_hoist_set; for (int64 operand_num = 0; operand_num < old_root->operand_count(); ++operand_num) { @@ -574,128 +426,545 @@ StatusOr ConvertSpecialMove(HloInstruction* conditional, // are the shape of the operands are identical and their properties are // identical. Will start from the root instruction of each branch and get // the identical ops to hoist. -StatusOr MergeIdenticalElements(HloInstruction* conditional, - bool is_layout_sensitive) { - VLOG(1) << " visiting conditional:" << conditional->ToString(); - int branch_count = conditional->branch_count(); - if (branch_count <= 0) { +StatusOr ConditionalCodeMotion::MoveInstructionOut( + HloInstruction* conditional, std::vector& to_move_out, + std::vector& new_boundaries) { + if (to_move_out.empty()) { return false; } - - std::vector visitors; - visitors.reserve(branch_count); - // Visit instructions from the root instruction to the operands using BFS. - for (int i = 0; i < branch_count; i++) { - visitors.emplace_back(BranchVisitor(conditional->branch_computation(i))); - } - - // The instructions to be visited within each branch. - std::vector front_instructions(branch_count); - - while (HasNextInstruction(visitors)) { - for (int i = 0; i < branch_count; i++) { - front_instructions[i] = visitors[i].GetNextInstruction(); - } - // If two instructions has the same shape, opcode and its operands has the - // same shape, then this instruction can be moved out of conditional. - if (WorthHoisting(front_instructions[0]) && - InstructionWithinBranchIdentical(front_instructions, - is_layout_sensitive)) { - for (int i = 0; i < branch_count; i++) { - visitors[i].AddInstructionOperands(front_instructions[i]); - visitors[i].AddInstructionToHoist(front_instructions[i]); - } - } else { - for (int i = 0; i < branch_count; i++) { - // If the ops are not identical, these ops and its users will - // be in the boundaries` of the conditional. These ops will be stayed - // within the conditional, but one its only user will be moved out - // of conditional. - visitors[i].AddInstructionToBoundary(front_instructions[i]); - } - } - } - - if (visitors[0].HoistInstructionSize() < 1) { - return false; - } - - HloInstruction* old_root = - conditional->branch_computation(0)->root_instruction(); + VLOG(1) << "number of boundaries to move out:" << to_move_out.size() << "\n"; HloComputation* conditional_parent = conditional->parent(); + // save the old users before add new conditional user instructions + std::vector old_conditional_users = conditional->users(); // Maps instructions in the conditional body to instructions hoisted outside // the conditional that compute the same value. - absl::flat_hash_map hoisted_instructions; - // Copy identical instructions out of the conditional. - CopyIdenticalInstructionsOutOfConditional(visitors[0].instructions_to_hoist(), - conditional_parent, - &hoisted_instructions); - // If there are instructions to hoist, the root of the conditional must be - // moved out. Change the users of the conditional to the hoisted instruction - // of the new root. 
- TF_RETURN_IF_ERROR( - ChangeConditionalUsers(conditional, old_root, hoisted_instructions)); - - // Create tuple element within each branch and set it as root. - for (int i = 0; i < branch_count; i++) { - HloInstruction* tuple = CreateNewRoot( - visitors[i].boundaries(), visitors[i].instructions_to_hoist_set(), - conditional->branch_computation(i)); - conditional->branch_computation(i)->set_root_instruction(tuple, true); - } - // Changes conditional instruction shape to the shape of the new root. - *conditional->mutable_shape() = - conditional->branch_computation(0)->root_instruction()->shape(); - + absl::flat_hash_map hoisted_instructions; // Insert GetTupleElement before the instructions whose operands might still // be within the conditional. - TF_RETURN_IF_ERROR(CreateGetTupleElementAfterConditional( - visitors[0].boundaries(), visitors[0].instructions_to_hoist_set(), - hoisted_instructions, conditional, conditional_parent)); - - // Remove hoist instructions from the branches. - for (int i = 0; i < branch_count; i++) { - TF_RETURN_IF_ERROR( - RemoveInstructionFromComputation(visitors[i].instructions_to_hoist(), - conditional->branch_computation(i))); + VLOG(2) << "before opt:" + << conditional_parent->ToString(HloPrintOptions::Fingerprint()) + << "\n"; + int64 op_index = 0; + for (Boundary b : new_boundaries) { + HloInstruction* op = b.operands()[0]; + CHECK(op != nullptr); + VLOG(2) << "Mapping new boundary instr: " << op->ToString() << "\n"; + HloInstruction* gtr = conditional_parent->AddInstruction( + HloInstruction::CreateGetTupleElement(op->shape(), conditional, + op_index++)); + Boundary b2(Boundary::Position::kOutsideBranch); + b2.mutable_operands().push_back(gtr); + hoisted_instructions[op] = b2; } + // Copy boundary instructions out of the conditional. + // Visit the operands before its users and copy it, so that the copied + // user will point to the correct operand. + for (int64 i = to_move_out.size() - 1; i >= 0; i--) { + TF_RETURN_IF_ERROR(CopyInOrOutOfConditional( + to_move_out[i], 0, conditional_parent, hoisted_instructions)); + } + VLOG(2) << "Done copy branch instructions out\n" + << conditional_parent->ToString(HloPrintOptions::Fingerprint()) + << "\n"; + // Change original users of the conditional to use the correct operands. + HloInstruction* old_root = + conditional->branch_computation(0)->root_instruction(); + for (auto user_instr : old_conditional_users) { + CHECK(user_instr->opcode() == HloOpcode::kGetTupleElement); + auto tuple_opd = static_cast(user_instr); + int64 index = tuple_opd->tuple_index(); + HloInstruction* old_opd = old_root->operands()[index]; + HloInstruction* new_opd = hoisted_instructions[old_opd].operands()[0]; + CHECK(old_opd != nullptr); + CHECK(new_opd != nullptr); + TF_RETURN_IF_ERROR(user_instr->ReplaceAllUsesWith(new_opd)); + TF_RETURN_IF_ERROR(conditional_parent->RemoveInstruction(user_instr)); + } + // Create tuple element within each branch and set it as root. 
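What MoveInstructionOut achieves can be pictured without HLO: each branch stops computing the hoisted expression, its root tuple carries the operands instead, and the expression is evaluated once on the conditional's result. A toy sketch (plain C++ lambdas, purely illustrative):

#include <cassert>
#include <cmath>

int main() {
  const bool pred = true;
  const double x = 9.0, y = 16.0;

  // Before: both branches compute sqrt(...) + 1.0 themselves.
  auto true_branch = [&] { return std::sqrt(x) + 1.0; };
  auto false_branch = [&] { return std::sqrt(y) + 1.0; };
  const double before = pred ? true_branch() : false_branch();

  // After hoisting: the branches only hand back their operand (the root
  // tuple shrinks to the operands of the moved instruction), and
  // sqrt(...) + 1.0 is emitted once, outside, on a get-tuple-element of the
  // conditional.
  auto true_branch_out = [&] { return x; };
  auto false_branch_out = [&] { return y; };
  const double selected = pred ? true_branch_out() : false_branch_out();
  const double after = std::sqrt(selected) + 1.0;

  assert(before == after);
  return 0;
}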
+ int64 branch_count = conditional->branch_count(); + for (int i = 0; i < branch_count; i++) { + auto computation = conditional->branch_computation(i); + std::vector elements; + for (auto b1 : new_boundaries) { + HloInstruction* op = b1.operands()[i]; + VLOG(1) << "branch count=" << i << "\n"; + CHECK(op != nullptr); + VLOG(1) << "Adding to root " << i << " with " << op->ToString() << "\n"; + elements.push_back(op); + } + HloInstruction* tuple = + computation->AddInstruction(HloInstruction::CreateTuple(elements)); + computation->set_root_instruction(tuple, true); + VLOG(2) << "computation is :" << computation->ToString() << "\n"; + // Remove hoisted instructions from the branches. + for (auto b2 : to_move_out) { + VLOG(2) << "Removing boundary:" << b2.ToString() << "\n"; + TF_RETURN_IF_ERROR(computation->RemoveInstruction(b2.operands()[i])); + } + } + // Change conditional instruction shape to the shape of the new root. + HloInstruction* new_root = + conditional->branch_computation(0)->root_instruction(); + *conditional->mutable_shape() = new_root->shape(); + // + VLOG(2) << "done moving instructions out of branches\n" + << conditional_parent->ToString(HloPrintOptions::Fingerprint()) + << "\n"; return true; } -} // namespace +// Hoist ops from outside of the conditional to inside the branches. +StatusOr ConditionalCodeMotion::MoveInstructionIn( + HloInstruction* conditional, std::vector& to_move_in, + std::vector& new_boundaries) { + if (to_move_in.empty()) { + return false; + } + VLOG(1) << "number of boundaries to move in:" << to_move_in.size() << "\n"; + HloComputation* conditional_parent = conditional->parent(); + VLOG(2) << "before opt:" + << conditional_parent->ToString(HloPrintOptions::Fingerprint()) + << "\n"; + // Mapping instructions to be moved to their new representations. + absl::flat_hash_map hoisted_instructions; + int64 to_move_in_size = to_move_in.size(); + int64 branch_count = conditional->branch_count(); + int64 op_index = conditional->shape().tuple_shapes_size(); + // Map conditional to its old root, then create a new root instruction in each + // branch. 
+ Boundary b(Boundary::Position::kInsideBranch); + for (int i = 0; i < branch_count; i++) { + auto computation = conditional->branch_computation(i); + auto old_root = computation->root_instruction(); + b.mutable_operands().push_back(old_root); + HloInstruction* new_root = nullptr; + if (old_root->opcode() == HloOpcode::kTuple) { + new_root = computation->AddInstruction(old_root->Clone()); + } else { + std::vector operands; + if (!old_root->shape().IsTuple()) { + operands.push_back(old_root); + } else { + const Shape& old_shape = old_root->shape(); + for (int64 i = 0; i < old_shape.tuple_shapes_size(); ++i) { + auto element = + computation->AddInstruction(HloInstruction::CreateGetTupleElement( + old_shape.tuple_shapes(i), old_root, i)); + operands.push_back(element); + } + } + new_root = + computation->AddInstruction(HloInstruction::CreateTuple(operands)); + } + VLOG(2) << "setting new root: " << new_root->ToString() << "\n"; + computation->set_root_instruction(new_root); + VLOG(2) << "new branch computation: " << computation->ToString() << "\n"; + } + hoisted_instructions[conditional] = b; + for (int64 i = 0; i < to_move_in_size; i++) { + Boundary b_to_move = to_move_in[i]; + HloInstruction* op = b_to_move.operands()[0]; + CHECK(op != nullptr); + bool to_be_used_outside = true; + VLOG(2) << "Mapping new boundary instr: " << op->ToString() << "\n"; + if (i < to_move_in_size - 1 && op->user_count() == 1 && + op->users()[0] == to_move_in[i + 1].operands()[0]) { + to_be_used_outside = false; + VLOG(2) << "Instruction is not to be used outside the branch\n"; + } + Boundary b(Boundary::Position::kInsideBranch); + for (int i = 0; i < branch_count; i++) { + auto computation = conditional->branch_computation(i); + TF_RETURN_IF_ERROR(CopyInOrOutOfConditional(b_to_move, i, computation, + hoisted_instructions)); + VLOG(2) << "After Copying to branch: " << computation->ToString() << "\n"; + if (to_be_used_outside) { + auto new_op = hoisted_instructions[op].operands()[i]; + auto new_root = computation->root_instruction(); + new_root->AppendOperand(new_op); + *new_root->mutable_shape()->add_tuple_shapes() = new_op->shape(); + VLOG(2) << "Extending conditional root " << i << " : " + << new_root->ToString() << "\n"; + } + VLOG(2) << "After extending branch root: " << computation->ToString() + << "\n"; + } + if (to_be_used_outside) { + // Modify uses of instructions outside of the conditionals + HloInstruction* gtr = conditional_parent->AddInstruction( + HloInstruction::CreateGetTupleElement(op->shape(), conditional, + op_index++)); + TF_RETURN_IF_ERROR(op->ReplaceAllUsesWith(gtr)); + if (conditional_parent->root_instruction() == op) { + conditional_parent->set_root_instruction(gtr); + } + } + } + VLOG(2) << "Done copying instructions inside branch: " + << conditional->ToString(HloPrintOptions::Fingerprint()) << "\n"; + // Change conditional instruction shape to the shape of the new root. + HloInstruction* new_root = + conditional->branch_computation(0)->root_instruction(); + *conditional->mutable_shape() = new_root->shape(); + VLOG(2) << "Before removing instructions:" << conditional_parent->ToString() + << "\n"; + // Remove hoisted instructions from the branches. 
+  for (int64 i = to_move_in_size - 1; i >= 0; i--) {
+    Boundary boundary_to_move_in = to_move_in[i];
+    VLOG(2) << "Removing boundary:" << boundary_to_move_in.ToString() << "\n";
+    HloInstruction* op = boundary_to_move_in.operands()[0];
+    for (auto user : op->users()) {
+      VLOG(2) << "Has User: " << user->ToString() << "\n";
+    }
+    TF_RETURN_IF_ERROR(conditional_parent->RemoveInstruction(op));
+  }
+  VLOG(2) << "Done moving instructions inside branches\n"
+          << conditional_parent->ToString(HloPrintOptions::Fingerprint())
+          << "\n";
+  return true;
+}
 
-StatusOr<bool> ConditionalCodeMotion::Run(HloModule* module) {
-  bool changed = false;
+// Group single chains of operands or uses of boundaries into new boundaries.
+class GroupConnectedBoundaries {
+ private:
+  std::vector<Boundary> connected_boundaries_, new_boundaries_;
+  HloInstruction* conditional_;
+  HloComputation* conditional_parent_;
+  bool is_layout_sensitive_;
+  absl::flat_hash_set<HloInstruction*> visited_;
 
-  if (pursue_full_conditional_code_motion_) {
-    std::vector<HloInstruction*> conditional_ops;
-    for (auto* comp : module->MakeComputationPostOrder()) {
-      for (auto* instr : comp->MakeInstructionPostOrder()) {
-        if (instr->opcode() == HloOpcode::kConditional) {
-          conditional_ops.push_back(instr);
+ public:
+  explicit GroupConnectedBoundaries(HloInstruction* conditional,
+                                    bool is_layout_sensitive)
+      : conditional_(conditional),
+        conditional_parent_(conditional->parent()),
+        is_layout_sensitive_(is_layout_sensitive) {}
+  // Returns true if `instruction` is worth hoisting out.
+  bool WorthHoisting(HloInstruction* instruction) {
+    // This is needed for the "moving-in" transformation, to prevent the root
+    // of the parent computation (which contains the conditional) from being
+    // moved inside the conditional.
+    if (instruction->opcode() == HloOpcode::kTuple &&
+        instruction == conditional_parent_->root_instruction()) {
+      return false;
+    }
+    switch (instruction->opcode()) {
+      case HloOpcode::kConvert:
+        // If Convert is after AllReduce, it is worth moving the AllReduce out
+        // of the conditional for AR/CRS combine. If Convert is after other
+        // ops such as Dot or Convolutional, it is better to keep the convert
+        // within the conditional so that it can be fused with Dot or
+        // Convolutional.
+        //
+        // TODO(b/154283721): figure out the scenario when convert can be
+        // fused with AllReduce out of conditional.
+        switch (instruction->operand(0)->opcode()) {
+          case HloOpcode::kAllReduce:
+          case HloOpcode::kReshape:
+            return true;
+          default:
+            VLOG(1) << "Instruction is convert and its operand is not known "
+                       "to be worth hoisting\n";
+            return false;
+        }
+      case HloOpcode::kAllReduce:
+      case HloOpcode::kAdd:
+      case HloOpcode::kPower:
+      case HloOpcode::kConstant:
+      case HloOpcode::kSubtract:
+      case HloOpcode::kMultiply:
+      case HloOpcode::kDivide:
+      case HloOpcode::kTuple:
+      case HloOpcode::kSqrt:
+      case HloOpcode::kReshape:
+      case HloOpcode::kGetTupleElement:
+        return true;
+      default:
+        VLOG(1) << "Instruction is not known to be worth hoisting\n";
+        return false;
+    }
+  }
+  int64 ReusesBeforeBoundary(HloInstruction* user) {
+    int64 reuses = 0;
+    for (auto op : user->operands()) {
+      // Only consider single-user cases as reusable.
+      if (ContainsKey(visited_, op) && op->user_count() == 1) {
+        reuses += ReusesCarriedBy(op, user);
+      } else if (op->opcode() == HloOpcode::kConditional &&
+                 user->opcode() == HloOpcode::kGetTupleElement) {
+        if (user->user_count() == 1) {
+          reuses += ReusesCarriedBy(op, user->users()[0]);
        }
      }
    }
+    VLOG(1) << "Reuses before instruction " << user->ToString() << ":" << reuses
+            << "\n";
+    return reuses;
+  }
 
-    for (HloInstruction* conditional_op : conditional_ops) {
-      TF_ASSIGN_OR_RETURN(
-          bool result,
-          MergeIdenticalElements(conditional_op, is_layout_sensitive_));
-      changed |= result;
+  int64 ReusesAfterBoundary(HloInstruction* user) {
+    CHECK(user != nullptr);
+    auto all_users = user->users();
+    // For now, assume that if an instruction has multiple consumers, it
+    // will not be reused, as the reuse may require duplication in
+    // fusion and so is expensive. If the situation changes in the future,
+    // some aspects of the overall algorithm need to be redesigned to
+    // accommodate the change.
+    if (all_users.size() > 1) {
+      return 0;
    }
+    if (!all_users.empty()) {
+      auto op = all_users[0];
+      int64 reuses = 0;
+      // Only count reuses that run through the conditional root.
+      if (op == conditional_->branch_computation(0)->root_instruction()) {
+        int64 index = op->operand_index(user);
+        for (auto op2 : conditional_->users()) {
+          // If the use is not a get-tuple-element, do not consider it for now.
+          if (op2->opcode() == HloOpcode::kGetTupleElement) {
+            auto tuple_opd = static_cast<HloGetTupleElementInstruction*>(op2);
+            if (index == tuple_opd->tuple_index()) {
+              all_users = op2->users();
+              if (!all_users.empty()) {
+                reuses += ReusesCarriedBy(user, all_users[0]);
+                break;
+              }
+            }
+          }
+        }
+      } else if (ContainsKey(visited_, op)) {
+        reuses += ReusesCarriedBy(user, op);
+      }
+      VLOG(1) << "reuses after instruction " << user->ToString() << ":"
+              << reuses << "\n";
+      return reuses;
+    }
+    return 0;
+  }
 
-    if (changed) {
-      HloPassPipeline subpipeline("after_conditional_code_motion");
-      subpipeline.AddPass();
-      subpipeline.AddPass();
-      subpipeline.AddPass();
-      TF_ASSIGN_OR_RETURN(bool cleanup_changed, subpipeline.Run(module));
-      changed |= cleanup_changed;
+  int64 BenefitForMovingBoundaries(const std::vector<Boundary>& boundaries) {
+    int64 reuses_before = 0, reuses_after = 0;
+    if (boundaries.size() == 1 && boundaries[0].IsOutsideBranch()) {
+      // The only boundary of moving-in is the get_tuple_element op.
+      return -1;
+    }
+    for (Boundary b : boundaries) {
+      auto op = b.operands()[0];
+      if (op == conditional_->branch_computation(0)->root_instruction()) {
+        continue;
+      }
+      reuses_before += ReusesBeforeBoundary(op);
+      VLOG(1) << "Reuses before boundary so far: " << reuses_before << "\n";
+      reuses_after += ReusesAfterBoundary(op);
+      VLOG(1) << "Reuses after boundary so far: " << reuses_after << "\n";
+    }
+    if (reuses_after == 0 && reuses_before == 0) {
+      return -1;
+    } else if (boundaries[0].IsInsideBranch()) {
+      return reuses_after - reuses_before;
+    } else {
+      return reuses_before - reuses_after;
    }
  }
+  Boundary GetNextBoundary(const Boundary& b, int64 op_index) {
+    Boundary b2(b.GetPosition());
+    for (int j = 0; j < b.operands().size(); ++j) {
+      HloInstruction* inst = b.operands()[j];
+      CHECK(inst != nullptr);
+      HloInstruction* op = (b.IsInsideBranch()) ? inst->operands()[op_index]
+                                                : inst->users()[op_index];
+      CHECK(op != nullptr);
+      b2.mutable_operands().push_back(op);
+    }
+    return b2;
+  }
+  int64 CountNonLeafOps(const xla::HloInstruction::InstructionVector& ops) {
+    int64 count = 0;
+    absl::flat_hash_set<HloInstruction*> op_set;
+    for (auto op : ops) {
+      if (!op_set.contains(op) && op->opcode() != HloOpcode::kConstant) {
+        count++;
+        op_set.insert(op);
+      }
+    }
+    return count;
+  }
+  // This function is reused both for moving a boundary out of and into a
+  // conditional. As a result, readability is somewhat compromised.
+  // It might be nice to refactor this function to factor the outside-inside
+  // considerations into separate function pointer parameters to improve
+  // readability.
+  void AddBoundaries(const Boundary& boundary) {
+    BoundaryVisitor visitor;
+    visitor.AddToWorkList(boundary);
+    while (visitor.HasNextBoundary()) {
+      Boundary b = visitor.PopNextBoundary();
+      VLOG(1) << "visiting boundary " << b.ToString() << "\n";
+      if ((b.IsOutsideBranch() || InstructionWithinBranchIdentical(
+                                      b.operands(), is_layout_sensitive_)) &&
+          WorthHoisting(b.operands()[0])) {
+        connected_boundaries_.push_back(b);
+        VLOG(1) << "boundary can be moved\n";
+        int64 operand_count = (b.IsInsideBranch())
+                                  ? b.operands()[0]->operand_count()
+                                  : b.operands()[0]->users().size();
+        for (int i = 0; i < operand_count; i++) {
+          Boundary next_boundary = GetNextBoundary(b, i);
+          int64 next_boundary_count =
+              (next_boundary.IsInsideBranch())
+                  ? next_boundary.operands()[0]->user_count()
+                  : CountNonLeafOps(next_boundary.operands()[0]->operands());
+          // Only consider adding an exclusive producer into the same group.
+          if (next_boundary_count == 1) {
+            VLOG(2) << "Add operand " << i << " to visit later\n";
+            visitor.AddToWorkList(next_boundary);
+          } else {
+            VLOG(2) << "Next boundary " << i
+                    << " has multiple uses: " << next_boundary_count << "\n";
+            if (!ContainsKey(visited_, next_boundary.operands()[0])) {
+              visited_.insert(next_boundary.operands()[0]);
+              new_boundaries_.push_back(next_boundary);
+            }
+          }
+        }
+      } else {
+        VLOG(1) << "boundary cannot be moved\n";
+        visited_.insert(b.operands()[0]);
+        new_boundaries_.push_back(b);
+      }
+    }
+  }
+  std::vector<Boundary> BoundariesToMoveInOrOut(const Boundary& b) {
+    // At the beginning of optimization, a conditional itself is added to a
+    // worklist. Here the conditional is expanded into two sets of boundaries:
+    // the first set is a single boundary inside the branches, holding the
+    // roots of all branches; the second set contains one boundary for each
+    // user of the conditional.
+    HloInstruction* inst = b.operands()[0];
+    if (inst->opcode() == HloOpcode::kConditional) {
+      int branch_count = inst->branch_count();
+      // Add conditional roots as a new boundary to visit.
+      Boundary boundary_in(Boundary::Position::kInsideBranch);
+      for (int i = 0; i < branch_count; i++) {
+        HloComputation* branch_computation = inst->branch_computation(i);
+        HloInstruction* root_inst = branch_computation->root_instruction();
+        CHECK(root_inst != nullptr);
+        boundary_in.mutable_operands().push_back(root_inst);
+      }
+      new_boundaries_.push_back(boundary_in);
+      // Add conditional users as new boundaries to visit.
+      for (auto u : inst->users()) {
+        Boundary boundary_in(Boundary::Position::kOutsideBranch);
+        boundary_in.mutable_operands().push_back(u);
+        new_boundaries_.push_back(boundary_in);
+      }
+    } else {
+      AddBoundaries(b);
+    }
+    return connected_boundaries_;
+  }
+  void AddNewBoundaries(std::vector<Boundary>& b) {
+    b.insert(b.end(), new_boundaries_.begin(), new_boundaries_.end());
+  }
+};
+
+ConditionalCodeMotion::Decision ConditionalCodeMotion::ConsiderCodeMotion(
+    HloInstruction* conditional, const Boundary& cur_boundary,
+    std::vector<Boundary>& to_move, std::vector<Boundary>& new_boundaries) {
+  GroupConnectedBoundaries connect(conditional, is_layout_sensitive_);
+  auto move_in_or_out = connect.BoundariesToMoveInOrOut(cur_boundary);
+  if (!move_in_or_out.empty()) {
+    auto benefit = connect.BenefitForMovingBoundaries(move_in_or_out);
+    VLOG(1) << "benefit of moving in or out "
+            << cur_boundary.operands()[0]->ToString() << ":" << benefit << "\n";
+    if (benefit >= 0) {
+      new_boundaries.clear();
+      connect.AddNewBoundaries(new_boundaries);
+      // The whole sequence in move_in_or_out is either all moving into a
+      // conditional, or all moving out of a conditional. So looking only
+      // at the first entry of the sequence is sufficient to know in which
+      // direction the move is intended.
+      to_move = move_in_or_out;
+      return to_move[0].IsInsideBranch() ? Decision::kMoveOutOfBranch
+                                         : Decision::kMoveIntoBranch;
+    }
+  } else {
+    connect.AddNewBoundaries(new_boundaries);
+  }
+  return ConditionalCodeMotion::Decision::kNoChange;
+}
+
+StatusOr<bool> ConditionalCodeMotion::Run(HloModule* module) {
+  // Gather all the conditional ops in the module ahead of time, to avoid
+  // potential complications from modifying the code that is being traversed.
+  std::vector<HloInstruction*> conditional_ops;
+  for (auto* comp : module->MakeComputationPostOrder()) {
+    for (auto* instr : comp->MakeInstructionPostOrder()) {
+      if (instr->opcode() == HloOpcode::kConditional) {
+        conditional_ops.push_back(instr);
+      }
+    }
+  }
+
+  bool changed = false;
+  for (HloInstruction* conditional : conditional_ops) {
+    // Boundaries to move out of or to move into the branches.
+    std::vector<Boundary> to_move_out, to_move_in, new_boundaries;
+    // The conditional is moved into a worklist as the seed (starting point).
+    // The conditional will be expanded into multiple seeds (starting points),
+    // its roots and its users, when it is visited by GroupConnectedBoundaries.
+    // A kNoChange decision will always be returned for the conditional itself,
+    // so that the other seeding boundaries can be visited in turn.
+    BoundaryVisitor visitor(conditional);
+    VLOG(2) << "Analyzing conditional:" << conditional->ToString() << "\n";
+    ConditionalCodeMotion::Decision d = Decision::kNoChange;
+    // The following loop breaks out as soon as a decision to modify the
+    // conditional is reached --- irrespective of whether visitor is empty.
+ while (d == Decision::kNoChange && visitor.HasNextBoundary()) { + std::vector to_move, next_boundary; + Boundary boundary = visitor.PopNextBoundary(); + VLOG(2) << "Analyzing boundary:" << boundary.ToString() << "\n"; + d = ConsiderCodeMotion(conditional, boundary, to_move, next_boundary); + switch (d) { + case Decision::kMoveOutOfBranch: + VLOG(2) << "Decision is move out of branch\n"; + to_move_out.insert(to_move_out.end(), to_move.begin(), to_move.end()); + new_boundaries.insert(new_boundaries.end(), next_boundary.begin(), + next_boundary.end()); + break; + case Decision::kMoveIntoBranch: + VLOG(2) << "Decision is move into branch\n"; + to_move_in.insert(to_move_in.end(), to_move.begin(), to_move.end()); + new_boundaries.insert(new_boundaries.end(), next_boundary.begin(), + next_boundary.end()); + break; + case Decision::kNoChange: + VLOG(2) << "Decision is no change\n"; + for (const Boundary& b : next_boundary) { + visitor.AddToWorkList(b); + } + break; + } + } + // At most one of to_move_out or to_move_in can be non-empty, since there is + // only one optimization decision. + if (!to_move_out.empty()) { + TF_ASSIGN_OR_RETURN( + bool result, + MoveInstructionOut(conditional, to_move_out, new_boundaries)); + VLOG(2) << "moving out result:" << result << "\n"; + changed |= result; + } else if (!to_move_in.empty()) { + TF_ASSIGN_OR_RETURN( + bool result, + MoveInstructionIn(conditional, to_move_in, new_boundaries)); + VLOG(2) << "moving in result:" << result << "\n"; + changed |= result; + } + } // handling convert rematerialization/hoisting - { + if (!changed && pursue_full_conditional_code_motion_) { std::vector conditional_ops; for (auto* comp : module->MakeComputationPostOrder()) { for (auto* instr : comp->MakeInstructionPostOrder()) { @@ -711,7 +980,6 @@ StatusOr ConditionalCodeMotion::Run(HloModule* module) { changed |= convert_result; } } - if (changed) { HloPassPipeline subpipeline( "after_conditional_code_motion_after_convert_hoisting"); @@ -721,8 +989,8 @@ StatusOr ConditionalCodeMotion::Run(HloModule* module) { TF_ASSIGN_OR_RETURN(bool cleanup_changed, subpipeline.Run(module)); changed |= cleanup_changed; } - return changed; } +} // namespace conditional_opt } // namespace xla diff --git a/tensorflow/compiler/xla/service/conditional_code_motion.h b/tensorflow/compiler/xla/service/conditional_code_motion.h index 95f02833e15..68a2aa58235 100644 --- a/tensorflow/compiler/xla/service/conditional_code_motion.h +++ b/tensorflow/compiler/xla/service/conditional_code_motion.h @@ -23,35 +23,81 @@ limitations under the License. namespace xla { -// ConditionalCodeMotion specializes in hoisting/rematerializing -// unconditional converts in the default mode. -// When pursue_full_conditional_code_motion_ is set to true, the -// full HLO pass moves identical ops out of a conditional in addition to moving -// converts. +namespace conditional_opt { +// At the conceptual level, a boundary can be thought of as representing a +// single virtual operation, except this virtual operation is conditionally +// instantiated into different concrete operations at each conditional branch. +// So a boundary is mapped to a single concrete operation if it is outside of +// conditional branches, and is mapped to a list of instructions if inside the +// branches. This data structure therefore allows a common data structure +// representation of the instructions to be moved, whether they are inside or +// outside of the branches. 
Subsequently, it allows a common implementation +// basis to be used for both moving instructions out of and for moving them +// inside branches. +class Boundary { + public: + enum class Position { kInsideBranch, kOutsideBranch, kUndefined }; + Boundary() : position_(Position::kUndefined) {} + explicit Boundary(Position p) : position_(p) {} + std::vector& mutable_operands() { return operands_; } + const std::vector& operands() const { return operands_; } + bool IsInsideBranch() const { return position_ == Position::kInsideBranch; } + bool IsOutsideBranch() const { return position_ == Position::kOutsideBranch; } + Position GetPosition() const { return position_; } + bool IsEmpty() const { return operands_.empty(); } + std::string ToString() const { + std::string res; + for (HloInstruction* op : operands_) { + res += op->ToString() + ";"; + } + return res; + } + + private: + // Boundary instructions in the conditional branches, one from each branch + // of the conditional; or a single operand from outside the conditional. + std::vector operands_; + Position position_; +}; + +// HLO pass that moves identical ops in/out of conditional. // - The definition of identical are the shape of the operands are identical // and their properties are identical. -// - Currently, only some types of instructions is supported. -// TODO(b/154283721): relax non-sharable operand constraint and avoid copies in -// the new root. // - Only the identical ops that won't share operands with other ops will // be moved out of conditional. class ConditionalCodeMotion : public HloModulePass { public: // If is_layout_sensitive is true, then the hoist process preserves layout // during identical comparison. Otherwise, layout is ignored. - explicit ConditionalCodeMotion( - bool is_layout_sensitive = true, - bool pursue_full_conditional_code_motion = false) + explicit ConditionalCodeMotion(bool is_layout_sensitive, + bool pursue_full_conditional_code_motion) : is_layout_sensitive_(is_layout_sensitive), pursue_full_conditional_code_motion_( pursue_full_conditional_code_motion) {} absl::string_view name() const override { return "conditional-code-motion"; } StatusOr Run(HloModule* module) override; + // Optimization decision for each boundary of the conditional instruction. + enum class Decision { kMoveOutOfBranch, kMoveIntoBranch, kNoChange }; + // If the optimization decision is NO_CHANGE, new_boundary is set to nullptr; + // otherwise, it is set to the new boundary after proposed optimization. + virtual Decision ConsiderCodeMotion(HloInstruction* conditional, + const Boundary& cur_boundary, + std::vector& to_move, + std::vector& new_boundaries); + private: const bool is_layout_sensitive_; const bool pursue_full_conditional_code_motion_; + + StatusOr MoveInstructionOut(HloInstruction* conditional, + std::vector& to_move_out, + std::vector& new_boundaries); + StatusOr MoveInstructionIn(HloInstruction* conditional, + std::vector& to_move_in, + std::vector& new_boundaries); }; +} // namespace conditional_opt } // namespace xla diff --git a/tensorflow/compiler/xla/service/conditional_code_motion_test.cc b/tensorflow/compiler/xla/service/conditional_code_motion_test.cc index 38b2b515fa0..b0a6ba92f48 100644 --- a/tensorflow/compiler/xla/service/conditional_code_motion_test.cc +++ b/tensorflow/compiler/xla/service/conditional_code_motion_test.cc @@ -33,7 +33,7 @@ limitations under the License. 
#include "tensorflow/core/platform/types.h" namespace xla { -namespace { +namespace conditional_opt { using ConditionalCodeMotionTest = HloTestBase; namespace op = xla::testing::opcode_matchers; @@ -117,6 +117,47 @@ ENTRY main { EXPECT_THAT(root, AllOf(op::Tuple(op::Convert()))); } +TEST_F(ConditionalCodeMotionTest, MoveConvertOutConditional) { + absl::string_view hlo_string = + R"( +HloModule RemoveDotOpOut + +on_true { + %arg_tuple.1 = (f32[93184,4]{1,0}) parameter(0) + %get-tuple-element.1 = f32[93184,4]{1,0} get-tuple-element(%arg_tuple.1), index=0 + %reshape.8493 = f32[2,512,364]{2,1,0} reshape(f32[93184,4]{1,0} %get-tuple-element.1) + %add.8493 = f32[2,512,364]{2,1,0} add(f32[2,512,364]{2,1,0} %reshape.8493, f32[2,512,364]{2,1,0} %reshape.8493) + %convert.2894 = bf16[2,512,364]{2,1,0} convert(f32[2,512,364]{2,1,0} %add.8493) + ROOT %tuple.1 = ( bf16[2,512,364]{2,1,0}) tuple(%convert.2894) +} + +on_false { + %arg_tuple.2 = (f32[93184,4]{1,0}) parameter(0) + %get-tuple-element.3 = f32[93184,4]{1,0} get-tuple-element(%arg_tuple.2), index=0 + %reshape.9717 = f32[2,512,364]{2,1,0} reshape(f32[93184,4]{1,0} %get-tuple-element.3) + %add.8493 = f32[2,512,364]{2,1,0} add(f32[2,512,364]{2,1,0} %reshape.9717, f32[2,512,364]{2,1,0} %reshape.9717) + %sub.8493 = f32[2,512,364]{2,1,0} subtract(f32[2,512,364]{2,1,0} %add.8493, f32[2,512,364]{2,1,0} %reshape.9717) + %convert.3604 = bf16[2,512,364]{2,1,0} convert(f32[2,512,364]{2,1,0} %reshape.9717), metadata={op_type="Cast" op_name="gradients/Cast_125_grad/Cast"} + ROOT %tuple.2 = (bf16[2,512,364]{2,1,0}) tuple(%convert.3604) +} + +ENTRY main { + pred.1 = pred[] parameter(0) + arg_tuple.11 = (f32[93184,4]{1,0}) parameter(1) + arg_tuple.22 = (f32[93184,4]{1,0}) parameter(2) + conditional = (bf16[2,512,364]{2,1,0}) conditional(pred.1, arg_tuple.11, arg_tuple.22), true_computation=on_true, false_computation=on_false + get-first-index = bf16[2,512,364]{2,1,0} get-tuple-element(conditional), index=0 + ROOT result = (bf16[2,512,364]{2,1,0}) tuple(get-first-index) +} +)"; + auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + ConditionalCodeMotion pass(true, true); + ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); + + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Tuple(op::Convert()))); +} + TEST_F(ConditionalCodeMotionTest, MoveConvertOut) { absl::string_view hlo_string = R"( @@ -152,8 +193,20 @@ ENTRY main { ConditionalCodeMotion pass(true, true); ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); + const HloInstruction* conditional = + FindInstruction(module.get(), "conditional"); + const HloComputation* on_true = conditional->branch_computation(0); + ASSERT_EQ(on_true->instruction_count(), 2); + const HloComputation* on_false = conditional->branch_computation(1); + ASSERT_EQ(on_false->instruction_count(), 2); + HloInstruction* root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, AllOf(op::Tuple(op::Add(op::Convert(), op::Convert())))); + EXPECT_THAT( + root, + AllOf(op::Tuple(op::Add(op::Convert(op::Reshape(op::GetTupleElement( + op::GetTupleElement(op::Conditional())))), + op::Convert(op::Reshape(op::GetTupleElement( + op::GetTupleElement(op::Conditional())))))))); } TEST_F(ConditionalCodeMotionTest, UserShareOperandCannotBeMoved) { @@ -173,7 +226,7 @@ on_true { add.2 = f32[] add(add.1, constant.2) add.3 = f32[] add(add.1, constant.3) add.4 = f32[] add(add.3, constant.5) - multiply.1 = f32[] multiply(add.2, constant.4) + multiply.1 = f32[] multiply(add.4, 
constant.4) ROOT tuple.6 = (f32[], f32[]) tuple(multiply.1, add.4) } @@ -202,7 +255,7 @@ ENTRY main { false_computation=on_false get-first-index = f32[] get-tuple-element(conditional), index=0 get-second-index = f32[] get-tuple-element(conditional), index=1 - ROOT result = (f32[], f32[]) tuple(get-first-index, get-second-index) + ROOT result = f32[] add(get-first-index, get-second-index) } )"; auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); @@ -216,13 +269,11 @@ ENTRY main { const HloComputation* on_false = conditional->branch_computation(1); ASSERT_EQ(on_false->instruction_count(), 9); - // Check only one add and multiply is moved out. HloInstruction* root = module->entry_computation()->root_instruction(); - EXPECT_THAT( - root, - AllOf(op::Tuple( - op::Multiply(op::GetTupleElement(op::Conditional()), op::Constant()), - op::Add(op::GetTupleElement(op::Conditional()), op::Constant())))); + EXPECT_THAT(root, + AllOf(op::Add(op::Multiply(op::GetTupleElement(op::Conditional()), + op::Constant()), + op::GetTupleElement(op::Conditional())))); } TEST_F(ConditionalCodeMotionTest, ConditionalRootElementChanged) { @@ -260,7 +311,7 @@ ENTRY main { conditional(pred.1, tuple.1, tuple.2), true_computation=on_true, false_computation=on_false get-first-index = f32[] get-tuple-element(conditional), index=0 - ROOT result = (f32[]) tuple(get-first-index) + ROOT result = f32[] add(get-first-index, get-first-index) } )"; auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); @@ -269,16 +320,21 @@ ENTRY main { const HloInstruction* conditional = FindInstruction(module.get(), "conditional"); const HloComputation* on_true = conditional->branch_computation(0); - ASSERT_EQ(on_true->instruction_count(), 7); + ASSERT_EQ(on_true->instruction_count(), 1); const HloComputation* on_false = conditional->branch_computation(1); - ASSERT_EQ(on_false->instruction_count(), 7); + ASSERT_EQ(on_false->instruction_count(), 1); - // add.3 in on_true will be moved out, add.1 and add.2 will be in condtional - // root. - ASSERT_TRUE(ShapeUtil::Compatible( - conditional->shape(), - ShapeUtil::MakeTupleShape( - {ShapeUtil::MakeShape(F32, {}), ShapeUtil::MakeShape(F32, {})}))); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf(op::Add( + op::Add( + op::Add(op::GetTupleElement(op::Conditional()), op::Constant()), + op::Add(op::GetTupleElement(op::Conditional()), op::Constant())), + op::Add( + op::Add(op::GetTupleElement(op::Conditional()), op::Constant()), + op::Add(op::GetTupleElement(op::Conditional()), + op::Constant()))))); } TEST_F(ConditionalCodeMotionTest, ConditionalIsRootInstruction) { @@ -329,24 +385,9 @@ ENTRY main { )"; auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); ConditionalCodeMotion pass(true, true); - ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); - - const HloInstruction* conditional = - FindInstruction(module.get(), "conditional"); - const HloComputation* on_true = conditional->branch_computation(0); - ASSERT_EQ(on_true->instruction_count(), 9); - const HloComputation* on_false = conditional->branch_computation(1); - ASSERT_EQ(on_false->instruction_count(), 9); - - // Check only one add and multiply is moved out. - // add.3 and add.5 can't be moved out because they share operands with - // other instructions. 
- HloInstruction* root = module->entry_computation()->root_instruction(); - EXPECT_THAT( - root, - AllOf(op::Tuple( - op::Multiply(op::GetTupleElement(op::Conditional()), op::Constant()), - op::Add(op::GetTupleElement(op::Conditional()), op::Constant())))); + // If there is no instruction after the conditional, there is no benefit to + // move + ASSERT_FALSE(pass.Run(&*module).ValueOrDie()); } TEST_F(ConditionalCodeMotionTest, LayoutMisMatchCannotMovedOut) { @@ -469,7 +510,8 @@ ENTRY main { false_computation=on_false get-first-index = f32[3,3,128,128] get-tuple-element(conditional), index=0 - ROOT result = (f32[3,3,128,128]) tuple(get-first-index) + add.1 = f32[3,3,128,128] add(f32[3,3,128,128] get-first-index, f32[3,3,128,128] get-first-index) + ROOT result = (f32[3,3,128,128]) tuple(add.1) } )"; auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); @@ -487,10 +529,57 @@ ENTRY main { conditional->shape(), ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape( BF16, {3, 3, 128, 128})}))); HloInstruction* root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, AllOf(op::Tuple(op::Convert(op::AllReduce( - op::GetTupleElement(op::Conditional())))))); + EXPECT_THAT( + root, + AllOf(op::Tuple(op::Add( + op::Convert(op::AllReduce(op::GetTupleElement(op::Conditional()))), + op::Convert( + op::AllReduce(op::GetTupleElement(op::Conditional()))))))); } -} // namespace +TEST_F(ConditionalCodeMotionTest, MovePowOpIn) { + absl::string_view hlo_string = + R"( +HloModule RemoveIdenticalInstruction + +on_true { + arg_tuple.1 = (f32[10]) parameter(0) + get-tuple-element.1 = f32[10] get-tuple-element(arg_tuple.1), index=0 + add.1 = f32[10] add(get-tuple-element.1, get-tuple-element.1) + ROOT tuple.3 = (f32[10]) tuple(add.1) +} + +on_false { + arg_tuple.2 = (f32[10]) parameter(0) + get-tuple-element.2 = f32[10] get-tuple-element(arg_tuple.2), index=0 + mul.1 = f32[10] multiply(get-tuple-element.2, get-tuple-element.2) + ROOT tuple.4 = (f32[10]) tuple(mul.1) +} + +ENTRY main { + pred.1 = pred[] parameter(0) + tuple.1 = (f32[10]) parameter(1) + tuple.2 = (f32[10]) parameter(2) + conditional = (f32[10]) + conditional(pred.1, tuple.1, tuple.2), true_computation=on_true, + false_computation=on_false + get-first-index = f32[10] get-tuple-element(conditional), index=0 + ROOT pow.1 = f32[10] power(get-first-index, get-first-index) +} +)"; + auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + ConditionalCodeMotion pass(true, true); + ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); + const HloInstruction* conditional = + FindInstruction(module.get(), "conditional"); + const HloComputation* on_true = conditional->branch_computation(0); + ASSERT_EQ(on_true->instruction_count(), 5); + const HloComputation* on_false = conditional->branch_computation(1); + ASSERT_EQ(on_false->instruction_count(), 5); + + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::GetTupleElement(op::Conditional()))); +} +} // namespace conditional_opt } // namespace xla diff --git a/tensorflow/compiler/xla/service/conditional_simplifier.cc b/tensorflow/compiler/xla/service/conditional_simplifier.cc index bb19a63a9ce..199bc787b83 100644 --- a/tensorflow/compiler/xla/service/conditional_simplifier.cc +++ b/tensorflow/compiler/xla/service/conditional_simplifier.cc @@ -41,6 +41,26 @@ limitations under the License. namespace xla { namespace { + +// A computation with array type that only contains parameters and tuples is +// considered emtpy. 
+bool ComputationIsEmptyWithArrayRoot(const HloComputation* computation) { + bool empty_operations = absl::c_all_of( + computation->MakeInstructionPostOrder(), [](const HloInstruction* inst) { + return inst->opcode() == HloOpcode::kTuple || + inst->opcode() == HloOpcode::kGetTupleElement || + inst->opcode() == HloOpcode::kParameter; + }); + bool contains_array = false; + ShapeUtil::ForEachSubshape(computation->root_instruction()->shape(), + [&](const Shape& shape, const ShapeIndex& index) { + if (shape.IsArray()) { + contains_array = true; + } + }); + return empty_operations && contains_array; +} + // Tries to replace a conditional with a call operation of the corresponding // computation. If the given conditional has a constant branch_index, tries to // replace it with a call to its corresponding branch computation and then @@ -124,7 +144,6 @@ StatusOr TryRemoveConditional(HloInstruction* conditional) { << conditional->ToShortString(); return false; } - HloInstruction* true_call_op = create_call(0); HloInstruction* false_call_op = create_call(1); auto condition_broadcast = [&](const Shape& shape) { @@ -140,6 +159,14 @@ StatusOr TryRemoveConditional(HloInstruction* conditional) { return computation->AddInstruction(HloInstruction::CreateGetTupleElement( hlo->shape().tuple_shapes(i), hlo, i)); }; + + bool branch_empty = + ComputationIsEmptyWithArrayRoot(conditional->branch_computation(0)) || + ComputationIsEmptyWithArrayRoot(conditional->branch_computation(1)); + // Empty branch is faster to execute than select. + if (branch_empty) { + return false; + } std::function select = [&](HloInstruction* t, HloInstruction* f) { if (f->shape().IsToken()) { @@ -559,6 +586,10 @@ StatusOr ConditionalSimplifier::Run(HloModule* module) { absl::flat_hash_set removed_conditionals; for (HloInstruction* conditional_op : conditional_ops) { + if (conditional_op->has_sharding()) { + // The code below doesn't handle sharding properly. + continue; + } changed |= MergeDuplicateTupleElements(conditional_op); changed |= RemoveUnusedTupleElements(conditional_op); changed |= ReplaceRootWithEmptyTupleIfNoUsers(conditional_op); @@ -573,18 +604,27 @@ StatusOr ConditionalSimplifier::Run(HloModule* module) { // lets collect them first. absl::flat_hash_map> calling_conditionals; + // Keys of calling_conditionals to get a deterministic ordering. 
+  std::vector<HloComputation*> calling_computationals_vector;
   for (HloInstruction* conditional : conditional_ops) {
     if (removed_conditionals.contains(conditional)) {
       continue;
     }
+
     for (int64 branch = 0; branch < conditional->branch_count(); ++branch) {
-      calling_conditionals[conditional->branch_computation(branch)].insert(
-          conditional);
+      auto* branch_comp = conditional->branch_computation(branch);
+      if (!calling_conditionals.contains(branch_comp)) {
+        calling_computationals_vector.push_back(branch_comp);
+      }
+      calling_conditionals[branch_comp].insert(conditional);
     }
   }
-  for (const auto& entry : calling_conditionals) {
+
+  for (auto* comp : calling_computationals_vector) {
+    auto entry = calling_conditionals.find(comp);
+    CHECK(entry != calling_conditionals.end());
     TF_ASSIGN_OR_RETURN(bool result, TryRemoveUnusedConditionalOperands(
-                                         entry.first, entry.second));
+                                         entry->first, entry->second));
     changed |= result;
   }
diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc
index 6bfd8c4db46..b88120d8128 100644
--- a/tensorflow/compiler/xla/service/copy_insertion.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion.cc
@@ -191,6 +191,30 @@ bool IndicesToCopyForWhile(const HloDataflowAnalysis& dataflow,
   return any_copies;
 }
 
+// Compute the indices of the conditional outputs which need copies.
+// Unambiguous buffers (buffers with only one value) don't need copies.
+bool IndicesToCopyForConditional(const HloDataflowAnalysis& dataflow,
+                                 const HloInstruction* xla_conditional,
+                                 ShapeTree<bool>* indices_to_copy) {
+  DCHECK(ShapeUtil::Compatible(indices_to_copy->shape(),
+                               xla_conditional->shape()));
+
+  bool any_copies = false;
+  for (auto& pair : *indices_to_copy) {
+    const ShapeIndex& index = pair.first;
+    bool& should_copy = pair.second;
+
+    CHECK_EQ(dataflow.GetValueSet(xla_conditional, index).values().size(), 1);
+
+    auto value = dataflow.GetValueSet(xla_conditional, index).values()[0];
+    // The conditional must be copied if the value is a phi.
+    should_copy =
+        value->is_phi() && value->defining_instruction() == xla_conditional;
+    any_copies |= should_copy;
+  }
+  return any_copies;
+}
+
 // Add kCopy instructions around the given kWhile instruction to eliminate any
 // possible live range interference of HLO values assuming a dependency-based
 // ordering (HloDependencyOrdering). Copies are added conservatively. There
@@ -306,24 +330,30 @@ Status AddCopiesForWhile(const HloAliasAnalysis& alias_analysis,
   }
   body->set_root_instruction(root_copy);
-
   return Status::OK();
 }
 
-// We add copies for all the indices of the true and false computation roots, in
-// order to resolve interference. We later rely on RemoveUnnecessaryCopies to
-// drop the unnecessary ones.
+// We add copies for all non-phi indices of the true and false computation
+// roots, in order to resolve interference. We later rely on
+// RemoveUnnecessaryCopies to drop the unnecessary ones.
Status AddCopiesForConditional(const HloAliasAnalysis& alias_analysis, HloInstruction* conditional) { VLOG(2) << "Adding copies for kConditional instruction " << conditional->name(); + ShapeTree indices_to_copy(conditional->shape()); TF_RET_CHECK(conditional->opcode() == HloOpcode::kConditional); - + if (!IndicesToCopyForConditional(alias_analysis.dataflow_analysis(), + conditional, &indices_to_copy)) { + VLOG(2) << "No copies necessary for kWhile instruction " + << conditional->name(); + return Status::OK(); + } for (HloComputation* computation : conditional->branch_computations()) { HloInstruction* root = computation->root_instruction(); std::vector users = root->users(); - TF_ASSIGN_OR_RETURN(HloInstruction * deep_copy, - computation->DeepCopyInstruction(root)); + TF_ASSIGN_OR_RETURN( + HloInstruction * deep_copy, + computation->DeepCopyInstruction(root, &indices_to_copy)); for (HloInstruction* user : users) { TF_RETURN_IF_ERROR(root->ReplaceUseWith(user, deep_copy)); } @@ -1128,6 +1158,7 @@ static int64 GetNumExistingCopies(const HloModule* module) { Status CopyInsertion::RemoveUnnecessaryCopies(const HloOrdering& ordering, HloModule* module) { + XLA_VLOG_LINES(4, module->ToString()); TF_ASSIGN_OR_RETURN(std::unique_ptr alias_analysis, HloAliasAnalysis::Run(module, can_share_buffer_)); diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD index 7f051d4d1b2..7c362b2da44 100644 --- a/tensorflow/compiler/xla/service/cpu/BUILD +++ b/tensorflow/compiler/xla/service/cpu/BUILD @@ -49,6 +49,7 @@ filegroup( "runtime_single_threaded_conv2d.cc", "runtime_single_threaded_fft.cc", "runtime_single_threaded_matmul.cc", + "runtime_topk.cc", ], visibility = [":friends"], ) @@ -64,6 +65,7 @@ filegroup( "runtime_single_threaded_conv2d.h", "runtime_single_threaded_fft.h", "runtime_single_threaded_matmul.h", + "runtime_topk.h", ], visibility = [":friends"], ) @@ -134,13 +136,16 @@ cc_library( "//tensorflow/compiler/xla/service:copy_insertion", "//tensorflow/compiler/xla/service:hlo_casting_utils", "//tensorflow/compiler/xla/service:dump", + "//tensorflow/compiler/xla/service:topk_rewriter", "//tensorflow/compiler/xla/service:map_inliner", "//tensorflow/compiler/xla/service:rng_bit_generator_expander", "//tensorflow/compiler/xla/service:tree_reduction_rewriter", "//tensorflow/compiler/xla/service:hlo_get_dimension_size_rewriter", + "//tensorflow/compiler/xla/service:conditional_canonicalizer", "//tensorflow/compiler/xla/service:conditional_to_select", "//tensorflow/compiler/xla/service:slow_operation_alarm", "//tensorflow/compiler/xla/service:scatter_expander", + "//tensorflow/compiler/xla/service:comparison_expander", "//tensorflow/compiler/xla/service:slice_sinker", "//tensorflow/compiler/xla:cpu_function_runtime", "//tensorflow/compiler/xla:literal", @@ -179,6 +184,7 @@ cc_library( "//tensorflow/compiler/xla/service:hlo_verifier", "//tensorflow/compiler/xla/service:indexed_array_analysis", "//tensorflow/compiler/xla/service:llvm_compiler", + "//tensorflow/compiler/xla/service:gather_expander", "//tensorflow/compiler/xla/service:reshape_mover", "//tensorflow/compiler/xla/service:rng_expander", "//tensorflow/compiler/xla/service:sort_simplifier", @@ -229,6 +235,7 @@ cc_library( ":runtime_fft", ":runtime_fork_join", ":runtime_key_value_sort", + ":runtime_topk", ":runtime_matmul", ":runtime_matmul_mkl", ":runtime_single_threaded_conv2d", @@ -615,7 +622,8 @@ cc_library( deps = [ ":runtime_lightweight_check", "//tensorflow/compiler/xla:executable_run_options", - 
"//tensorflow/core/kernels:eigen_helpers_no_mkl", + "//tensorflow/core/kernels:eigen_contraction_kernel", + "//tensorflow/core/kernels:eigen_helpers", "//tensorflow/core/platform:dynamic_annotations", "//tensorflow/core/platform:mutex", "//tensorflow/core/platform:types", @@ -703,6 +711,7 @@ cc_library( visibility = ["//visibility:public"], deps = [ ":runtime_lightweight_check", + "//tensorflow/core/kernels:eigen_contraction_kernel", "//tensorflow/core/kernels:eigen_helpers", "//tensorflow/core/platform:dynamic_annotations", "//tensorflow/core/platform:types", @@ -756,6 +765,19 @@ cc_library( ], ) +cc_library( + name = "runtime_topk", + srcs = ["runtime_topk.cc"], + hdrs = ["runtime_topk.h"], + copts = runtime_copts(), + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/core/platform:dynamic_annotations", + "//tensorflow/core/platform:macros", + "//tensorflow/core/platform:types", + ], +) + cc_library( name = "runtime_fork_join", srcs = ["runtime_fork_join.cc"], diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc index 5464cfee082..39d2b11ad37 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc @@ -54,6 +54,8 @@ limitations under the License. #include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/call_inliner.h" #include "tensorflow/compiler/xla/service/cholesky_expander.h" +#include "tensorflow/compiler/xla/service/comparison_expander.h" +#include "tensorflow/compiler/xla/service/conditional_canonicalizer.h" #include "tensorflow/compiler/xla/service/conditional_simplifier.h" #include "tensorflow/compiler/xla/service/conditional_to_select.h" #include "tensorflow/compiler/xla/service/convolution_group_converter.h" @@ -76,6 +78,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/dynamic_index_splitter.h" #include "tensorflow/compiler/xla/service/dynamic_padder.h" #include "tensorflow/compiler/xla/service/flatten_call_graph.h" +#include "tensorflow/compiler/xla/service/gather_expander.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_constant_folding.h" @@ -103,6 +106,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/slice_sinker.h" #include "tensorflow/compiler/xla/service/slow_operation_alarm.h" #include "tensorflow/compiler/xla/service/sort_simplifier.h" +#include "tensorflow/compiler/xla/service/topk_rewriter.h" #include "tensorflow/compiler/xla/service/transpose_folding.h" #include "tensorflow/compiler/xla/service/tree_reduction_rewriter.h" #include "tensorflow/compiler/xla/service/triangular_solve_expander.h" @@ -258,6 +262,7 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn( pipeline.AddPass(); pipeline.AddPass(); + pipeline.AddPass(); pipeline.AddPass(); pipeline.AddPass(); @@ -284,6 +289,7 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn( /*rewrite_grad_op=*/true); pipeline.AddPass( /*expansion_type=*/LogisticExpansionType::kExp); + pipeline.AddPass(); pipeline.AddPass(); pipeline.AddPass(); pipeline.AddPass(); @@ -300,6 +306,7 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn( pass.AddPass(options); pass.AddPass(); pass.AddPass(); + pass.AddPass(GatherExpander::kEliminateSimpleGathers); // BatchNormExpander can create zero-sized ops, so zero-sized HLO // elimination has to come after that pass. 
@@ -318,6 +325,9 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn( pass.AddPass(); pass.AddPass(); } + pipeline.AddPass([](const HloSortInstruction* sort, int64) { + return sort->operand(0)->shape().element_type() == F32; + }); pipeline.AddPass(); pipeline.AddPass( [&](const HloInstruction& dot, @@ -614,10 +624,9 @@ StatusOr> CpuCompiler::RunBackend( // Compile must be thread-safe so create a new LLVM context for the module. mlir::MLIRContext mlir_context; - auto llvm_module = absl::make_unique( - "__compute_module", - mlir_context.getRegisteredDialect() - ->getLLVMContext()); + llvm::LLVMContext llvm_context; + auto llvm_module = + absl::make_unique("__compute_module", llvm_context); auto jit = absl::make_unique( CompilerTargetOptions(module->config()), @@ -826,10 +835,8 @@ CpuCompiler::CompileAheadOfTime(std::unique_ptr module_group, // Compile must be thread-safe so create a new LLVM context for the module. mlir::MLIRContext mlir_context; - llvm::Module llvm_module( - "__compute_module", - mlir_context.getRegisteredDialect() - ->getLLVMContext()); + llvm::LLVMContext llvm_context; + llvm::Module llvm_module("__compute_module", llvm_context); llvm_module.setDataLayout(target_machine->createDataLayout()); llvm_module.setTargetTriple(triple.getTriple()); if (pic_level != llvm::PICLevel::NotPIC) { diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc index 0abcc91a1d7..7431e829b8e 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc @@ -247,6 +247,12 @@ StatusOr CpuExecutable::CreateResultShapedBuffer( ExecutionInput& input = arguments[alias->parameter_number]; MaybeOwningDeviceMemory* maybe_owning_memory = input.MutableBuffer(alias->parameter_index); + if (alias->must_alias() && !maybe_owning_memory->HasOwnership()) { + return InvalidArgument( + "An input was configured to be must-alias at " + "compile time but not donated at runtime: %s", + alias->ToString()); + } if (absl::optional owning = maybe_owning_memory->Release()) { // If the caller passes the ownership of the device memory, reuse it diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.cc b/tensorflow/compiler/xla/service/cpu/cpu_options.cc index c0222010fd9..ff654c83d61 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_options.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_options.cc @@ -25,7 +25,6 @@ const char* const kXlaOptimizeForSizeCpuOption = "xla_cpu_optimize_for_size"; const char* const kLlvmIrDotTilingFactor = "xla_llvm_dot_tiling_factor"; const char* const kXlaForceEnableExperimentalLlvmIrGemm = "xla_force_enable_experimental_llvm_ir_gemm"; -const char* const kXlaUseLinalgForDot = "xla_use_linalg_for_dot"; const char* const kLlvmIrGemmTileSize = "xla_llvm_ir_gemm_tile_size"; } // namespace @@ -64,12 +63,6 @@ bool ForceEnableExperimentalLlvmIrGemm(const HloModuleConfig& config) { return extra_options_map.count(kXlaForceEnableExperimentalLlvmIrGemm) > 0; } -bool UseLinalgForDot(const HloModuleConfig& config) { - const auto& extra_options_map = - config.debug_options().xla_backend_extra_options(); - return extra_options_map.count(kXlaUseLinalgForDot) > 0; -} - static absl::string_view RemoveSuffix(absl::string_view str, absl::string_view suffix) { CHECK_GE(str.size(), suffix.size()); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.h b/tensorflow/compiler/xla/service/cpu/cpu_options.h index 5d25aef6912..99e6702d14a 100644 --- 
a/tensorflow/compiler/xla/service/cpu/cpu_options.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_options.h @@ -27,7 +27,6 @@ namespace options { bool OptimizeForSizeRequested(const HloModuleConfig& config); bool VectorizedReduceDisabled(const HloModuleConfig& config); bool ForceEnableExperimentalLlvmIrGemm(const HloModuleConfig& config); -bool UseLinalgForDot(const HloModuleConfig& config); absl::optional LlvmIrGemvTilingFactor(const HloModuleConfig& config); absl::optional> LlvmIrGemmTileSize( const HloModuleConfig& config); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc index 2231ecfa1e8..5bee6049a5e 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc @@ -117,6 +117,7 @@ extern const char* const kParallelForkJoinSymbolName = "__xla_cpu_runtime_ParallelForkJoin"; extern const char* const kKeyValueSortSymbolName = "__xla_cpu_runtime_KeyValueSort"; +extern const char* const kTopKF32SymbolName = "__xla_cpu_runtime_TopKF32"; extern const char* const kTracingStartSymbolName = "__xla_cpu_runtime_TracingStart"; extern const char* const kTracingEndSymbolName = "__xla_cpu_runtime_TracingEnd"; diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h index ee75b97e4dc..eb24e0bc334 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h @@ -72,6 +72,7 @@ extern const char* const kAcquireOutfeedBufferForPopulationSymbolName; extern const char* const kReleaseOutfeedBufferAfterPopulationSymbolName; extern const char* const kParallelForkJoinSymbolName; extern const char* const kKeyValueSortSymbolName; +extern const char* const kTopKF32SymbolName; extern const char* const kAllReduceSymbolName; extern const char* const kCollectivePermuteSymbolName; extern const char* const kReplicaIdSymbolName; diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc index ee4bcf4cd35..2b3865b4dba 100644 --- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc @@ -270,11 +270,48 @@ Status DotOpEmitter::EmitLinalgMatmul() { return EmitMlirFuncAndCall( mlir_context_, b_, dot_info_.result_shape, operand_shapes, target_ptr, operand_ptrs, name, [&](mlir::OpBuilder* builder, mlir::FuncOp function) { + CHECK_EQ(dot_info_.dim_nums.lhs_contracting_dimensions_size(), 1); + CHECK_EQ(dot_info_.dim_nums.rhs_contracting_dimensions_size(), 1); + mlir::MLIRContext* context = builder->getContext(); mlir::edsc::ScopedContext scope(*builder, function.getLoc()); mlir::Value a = function.getArgument(0), b = function.getArgument(1), c = function.getArgument(2); - mlir::edsc::intrinsics::linalg_matmul(mlir::TypeRange{}, - mlir::ValueRange{b, c, a}); + + llvm::SmallVector b_exprs( + dot_info_.lhs_shape.rank()); + llvm::SmallVector c_exprs( + dot_info_.rhs_shape.rank()); + + llvm::SmallVector parallel_exprs; + mlir::AffineExpr reduce_expr; + for (int i = 0; i != dot_info_.result_shape.rank(); ++i) { + parallel_exprs.push_back(mlir::getAffineDimExpr(i, context)); + } + reduce_expr = + mlir::getAffineDimExpr(dot_info_.result_shape.rank(), context); + + // The reduction expr is shared for both inputs. 
+ b_exprs[dot_info_.dim_nums.lhs_contracting_dimensions(0)] = reduce_expr; + c_exprs[dot_info_.dim_nums.rhs_contracting_dimensions(0)] = reduce_expr; + + // Fill in the remaining parallel exprs. + int par_expr_num = 0; + for (auto* v : {&b_exprs, &c_exprs}) { + for (auto& e : *v) { + if (!e) { + e = parallel_exprs[par_expr_num++]; + } + } + } + + llvm::SmallVector types( + parallel_exprs.size(), mlir::IteratorType::Parallel); + types.push_back(mlir::IteratorType::Reduction); + + mlir::edsc::StructuredIndexed s_a(a), s_b(b), s_c(c); + mlir::edsc::makeGenericLinalgOp(types, {s_b(b_exprs), s_c(c_exprs)}, + {s_a(parallel_exprs)}, + mlir::edsc::ops::macRegionBuilder); mlir::edsc::intrinsics::std_ret(); mlir::linalg::LinalgTilingOptions tilingOptions; @@ -283,13 +320,13 @@ Status DotOpEmitter::EmitLinalgMatmul() { target_machine_features_.minimum_alignment_for_allocation( ShapeUtil::ByteSizeOf(dot_info_.result_shape)); mlir_strategy::MatmulCodegenStrategy strategy; - strategy.tile(tilingOptions) - .promote( + strategy.tile(tilingOptions) + .promote( mlir::linalg::LinalgPromotionOptions() .setAlignment(alignment) .setUseFullTileBuffersByDefault(true) .setUseAlloca(true)) - .vectorize() + .vectorize() .setVectorTransformsOptions( mlir::vector::VectorTransformsOptions() .setVectorTransformsOptions( @@ -986,9 +1023,7 @@ DotImplementationStrategy GetDotImplementationStrategy( if (IsAlignedGemm(dot_info, target_machine_features)) { if (CanEmitTiledLlvmIrGemm(config, dot_info, target_machine_features)) { - return options::UseLinalgForDot(config) - ? DotImplementationStrategy::kLinalgMatmul - : DotImplementationStrategy::kTiledLlvmIrGemm; + return DotImplementationStrategy::kLinalgMatmul; } return DotImplementationStrategy::kEigen; } diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index ebb2df23805..242f3c6ceb7 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -1397,6 +1397,7 @@ Status IrEmitter::HandleAllToAll(HloInstruction* instruction) { Status IrEmitter::HandleCollectivePermute(HloInstruction* crs) { auto* instr = Cast(crs); + TF_RETURN_IF_ERROR(EmitTargetAddressForOp(instr)); std::string source_target_pairs = absl::StrJoin( instr->source_target_pairs(), ",", absl::PairFormatter("=")); llvm::Value* source_target_pairs_v = @@ -2386,6 +2387,45 @@ Status IrEmitter::HandlePadToStatic(HloInstruction* hlo) { return Status::OK(); } +Status IrEmitter::HandleTopK(HloInstruction* hlo) { + TF_RETURN_IF_ERROR(EmitTargetAddressForOp(hlo)); + const HloInstruction* input = hlo->operand(0); + const int64 k = hlo->shape().tuple_shapes(0).dimensions().back(); + const bool has_batch = hlo->shape().tuple_shapes(0).dimensions_size() == 2; + TF_RET_CHECK(input->shape().element_type() == F32); + TF_RET_CHECK(LayoutUtil::IsMonotonicWithDim0Major( + hlo->shape().tuple_shapes(0).layout())); + TF_RET_CHECK(LayoutUtil::IsMonotonicWithDim0Major( + hlo->shape().tuple_shapes(1).layout())); + TF_RET_CHECK( + LayoutUtil::IsMonotonicWithDim0Major(hlo->operand(0)->shape().layout())); + + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice values_slice, + assignment_.GetUniqueSlice(hlo->operand(0), {})); + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice out_values_slice, + assignment_.GetUniqueSlice(hlo, {0})); + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice out_indices_slice, + assignment_.GetUniqueSlice(hlo, {1})); + llvm::Value* values_ptr = + EmitBufferPointer(values_slice, 
hlo->operand(0)->shape()); + llvm::Value* out_values_ptr = + EmitBufferPointer(out_values_slice, hlo->shape().tuple_shapes(0)); + llvm::Value* out_indices_ptr = + EmitBufferPointer(out_indices_slice, hlo->shape().tuple_shapes(1)); + EmitCallToFunc( + runtime::kTopKF32SymbolName, + {b_.getInt64(has_batch ? input->shape().dimensions(0) : 1), + b_.getInt64(input->shape().dimensions().back()), b_.getInt64(k), + BitCast(values_ptr, b_.getFloatTy()->getPointerTo()), + BitCast(out_values_ptr, b_.getFloatTy()->getPointerTo()), + BitCast(out_indices_ptr, b_.getInt32Ty()->getPointerTo())}, + b_.getVoidTy()); + + llvm_ir::EmitTuple(GetIrArrayFor(hlo), {out_values_ptr, out_indices_ptr}, + &b_); + return Status::OK(); +} + Status IrEmitter::HandleCustomCall(HloInstruction* custom_call) { if (custom_call->custom_call_target() == "PadToStatic") { return HandlePadToStatic(custom_call); @@ -2393,6 +2433,9 @@ Status IrEmitter::HandleCustomCall(HloInstruction* custom_call) { if (custom_call->custom_call_target() == "SliceToDynamic") { return HandleSliceToDynamic(custom_call); } + if (custom_call->custom_call_target() == "TopK") { + return HandleTopK(custom_call); + } absl::Span operands(custom_call->operands()); llvm::Type* i8_ptr_type = b_.getInt8PtrTy(); llvm::AllocaInst* operands_alloca = @@ -3037,10 +3080,21 @@ void IrEmitter::TracingState::EmitTracingEnd(llvm::IRBuilder<>* b, {b->CreateBitCast(run_options, void_ptr_type), activity_id}); } +namespace { +bool IsHloVeryCheap(const HloInstruction* hlo) { + return hlo->opcode() == HloOpcode::kBitcast || + hlo->opcode() == HloOpcode::kTuple || + hlo->opcode() == HloOpcode::kGetTupleElement || + hlo->opcode() == HloOpcode::kParameter || + hlo->opcode() == HloOpcode::kConstant; +} +} // namespace + Status IrEmitter::Preprocess(HloInstruction* hlo) { VLOG(3) << "Visiting: " << hlo->ToString(); - if (instruction_to_profile_idx_.count(hlo)) { - // Only trace the same HLOs that the profiler does. + // When profiling is enabled, trace the same HLOs that the profiler does. + if (instruction_to_profile_idx_.count(hlo) || + (hlo_module_config_.cpu_traceme_enabled() && !IsHloVeryCheap(hlo))) { tracing_state_.EmitTracingStart(&b_, hlo, GetExecutableRunOptionsArgument()); profiling_state_.RecordCycleStart(&b_, hlo); @@ -3052,8 +3106,9 @@ Status IrEmitter::Postprocess(HloInstruction* hlo) { if (auto* prof_counter = GetProfileCounterFor(*hlo)) { profiling_state_.RecordCycleDelta(&b_, hlo, prof_counter); } - // Only trace the same HLOs that the profiler does. - if (instruction_to_profile_idx_.count(hlo)) { + // When profiling is enabled, trace the same HLOs that the profiler does. 
+ if (instruction_to_profile_idx_.count(hlo) || + (hlo_module_config_.cpu_traceme_enabled() && !IsHloVeryCheap(hlo))) { tracing_state_.EmitTracingEnd(&b_, hlo, GetExecutableRunOptionsArgument()); } return Status::OK(); diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h index 3955deefbea..f136e3470e5 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h @@ -190,6 +190,7 @@ class IrEmitter : public DfsHloVisitorWithDefault, private: Status HandleSliceToDynamic(HloInstruction* hlo); Status HandlePadToStatic(HloInstruction* hlo); + Status HandleTopK(HloInstruction* hlo); Status HandleAllReduceSingleReplica(HloInstruction* crs); Status HandleAllReduceMultipleReplica(HloInstruction* crs); diff --git a/tensorflow/compiler/xla/service/cpu/mlir_emitter.cc b/tensorflow/compiler/xla/service/cpu/mlir_emitter.cc index ff48f554ce6..ae23f224207 100644 --- a/tensorflow/compiler/xla/service/cpu/mlir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/mlir_emitter.cc @@ -32,7 +32,8 @@ namespace cpu { namespace { // Lower an MLIR module to an LLVM module. -std::unique_ptr MakeLLVMModule(mlir::OwningModuleRef module) { +std::unique_ptr MakeLLVMModule(mlir::OwningModuleRef module, + llvm::LLVMContext *context) { // When set, the LLVM backend will be allowed to reassociate floating-point // reductions, which enables much more efficient "horizontal" SIMD // implementations. @@ -47,7 +48,7 @@ std::unique_ptr MakeLLVMModule(mlir::OwningModuleRef module) { mlir::LowerVectorToLLVMOptions().setReassociateFPReductions( kReassociateFPReductions))); CHECK(succeeded(manager.run(*module))); - return mlir::translateModuleToLLVMIR(*module); + return mlir::translateModuleToLLVMIR(*module, *context); } // Get arguments to pass a memref to an mlir function. @@ -114,7 +115,8 @@ Status EmitMlirFuncAndCall( emitter(&op_builder, function); // Now link it all into the main LLVM module. - auto mlir_llvm_module = MakeLLVMModule(std::move(mlir_module)); + auto mlir_llvm_module = + MakeLLVMModule(std::move(mlir_module), &b->getContext()); mlir_llvm_module->setDataLayout(llvm_module->getDataLayout()); llvm::Linker::linkModules( *llvm_module, std::move(mlir_llvm_module), llvm::Linker::None, diff --git a/tensorflow/compiler/xla/service/cpu/runtime_conv2d.cc b/tensorflow/compiler/xla/service/cpu/runtime_conv2d.cc index 84cb41a8f17..eac0371b76d 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_conv2d.cc +++ b/tensorflow/compiler/xla/service/cpu/runtime_conv2d.cc @@ -23,16 +23,18 @@ limitations under the License. 
#include "tensorflow/core/platform/dynamic_annotations.h" #include "tensorflow/core/platform/types.h" -using tensorflow::int64; - TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenConvF32( const void* run_options_ptr, float* out, float* lhs, float* rhs, - int64 input_batch, int64 input_rows, int64 input_cols, int64 input_channels, - int64 kernel_rows, int64 kernel_cols, int64 kernel_channels, - int64 kernel_filters, int64 output_rows, int64 output_cols, - int64 row_stride, int64 col_stride, int64 padding_top, int64 padding_bottom, - int64 padding_left, int64 padding_right, int64 lhs_row_dilation, - int64 lhs_col_dilation, int64 rhs_row_dilation, int64 rhs_col_dilation) { + tensorflow::int64 input_batch, tensorflow::int64 input_rows, + tensorflow::int64 input_cols, tensorflow::int64 input_channels, + tensorflow::int64 kernel_rows, tensorflow::int64 kernel_cols, + tensorflow::int64 kernel_channels, tensorflow::int64 kernel_filters, + tensorflow::int64 output_rows, tensorflow::int64 output_cols, + tensorflow::int64 row_stride, tensorflow::int64 col_stride, + tensorflow::int64 padding_top, tensorflow::int64 padding_bottom, + tensorflow::int64 padding_left, tensorflow::int64 padding_right, + tensorflow::int64 lhs_row_dilation, tensorflow::int64 lhs_col_dilation, + tensorflow::int64 rhs_row_dilation, tensorflow::int64 rhs_col_dilation) { const xla::ExecutableRunOptions* run_options = static_cast(run_options_ptr); XLA_LIGHTWEIGHT_CHECK(run_options->intra_op_thread_pool() != nullptr); @@ -46,13 +48,17 @@ TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenConvF32( TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenConvF16( const void* run_options_ptr, Eigen::half* out, Eigen::half* lhs, - Eigen::half* rhs, int64 input_batch, int64 input_rows, int64 input_cols, - int64 input_channels, int64 kernel_rows, int64 kernel_cols, - int64 kernel_channels, int64 kernel_filters, int64 output_rows, - int64 output_cols, int64 row_stride, int64 col_stride, int64 padding_top, - int64 padding_bottom, int64 padding_left, int64 padding_right, - int64 lhs_row_dilation, int64 lhs_col_dilation, int64 rhs_row_dilation, - int64 rhs_col_dilation) { + Eigen::half* rhs, tensorflow::int64 input_batch, + tensorflow::int64 input_rows, tensorflow::int64 input_cols, + tensorflow::int64 input_channels, tensorflow::int64 kernel_rows, + tensorflow::int64 kernel_cols, tensorflow::int64 kernel_channels, + tensorflow::int64 kernel_filters, tensorflow::int64 output_rows, + tensorflow::int64 output_cols, tensorflow::int64 row_stride, + tensorflow::int64 col_stride, tensorflow::int64 padding_top, + tensorflow::int64 padding_bottom, tensorflow::int64 padding_left, + tensorflow::int64 padding_right, tensorflow::int64 lhs_row_dilation, + tensorflow::int64 lhs_col_dilation, tensorflow::int64 rhs_row_dilation, + tensorflow::int64 rhs_col_dilation) { const xla::ExecutableRunOptions* run_options = static_cast(run_options_ptr); XLA_LIGHTWEIGHT_CHECK(run_options->intra_op_thread_pool() != nullptr); diff --git a/tensorflow/compiler/xla/service/cpu/runtime_conv2d_impl.h b/tensorflow/compiler/xla/service/cpu/runtime_conv2d_impl.h index 193c25f2a4b..ec634e7f738 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_conv2d_impl.h +++ b/tensorflow/compiler/xla/service/cpu/runtime_conv2d_impl.h @@ -19,6 +19,10 @@ limitations under the License. 
#include "tensorflow/core/kernels/eigen_spatial_convolutions.h" #include "tensorflow/core/platform/types.h" +#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL) +#include "tensorflow/core/kernels/eigen_contraction_kernel.h" +#endif + // 'tensorflow' namespace is used so that int64 and other types don't require // qualification. namespace tensorflow { diff --git a/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc b/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc index 0d4e7055ddb..2cee58162fc 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc +++ b/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc @@ -25,21 +25,16 @@ limitations under the License. #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" -namespace { -using tensorflow::int32; -using tensorflow::int64; -} // namespace - TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSort( - int64 a, int64 b, int64 c, char** values, int32 values_count, - int32* values_primitive_type_size_in_bytes, bool is_stable, - char* run_options, int64* prof_counters, + tensorflow::int64 a, tensorflow::int64 b, tensorflow::int64 c, char** values, tensorflow::int32 values_count, + tensorflow::int32* values_primitive_type_size_in_bytes, bool is_stable, + char* run_options, tensorflow::int64* prof_counters, void (*less_than)(char*, char*, char**, char**, tensorflow::int64*)) { // 'values' and 'values_primitive_type_size_in_bytes' are managed by the JIT // code, so msan can't tell they are initialized. TF_ANNOTATE_MEMORY_IS_INITIALIZED(values, values_count * sizeof(char*)); TF_ANNOTATE_MEMORY_IS_INITIALIZED(values_primitive_type_size_in_bytes, - values_count * sizeof(int32)); + values_count * sizeof(tensorflow::int32)); // High-level idea of the iteration/sorting logic: // Conceptually we have a 3-dimensional shape [a, b, c]. b corresponds to the @@ -50,16 +45,16 @@ TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSort( // 'base_offset' value which points to the first element in that row, and add // i * c for accessing the 'i'-th element in that row. - int64 sort_dimension_elements = b; - int64 num_iteration_elements = a * c; - int64 sort_dimension_offset = c; + tensorflow::int64 sort_dimension_elements = b; + tensorflow::int64 num_iteration_elements = a * c; + tensorflow::int64 sort_dimension_offset = c; - std::unique_ptr indices(new int64[sort_dimension_elements]); + std::unique_ptr indices(new tensorflow::int64[sort_dimension_elements]); std::unique_ptr comparison_values(new char*[2 * values_count]); std::iota(indices.get(), indices.get() + sort_dimension_elements, 0); std::unique_ptr reordered_values( new std::string[sort_dimension_elements]); - for (int64 index = 0; index < num_iteration_elements; ++index) { + for (tensorflow::int64 index = 0; index < num_iteration_elements; ++index) { // If the sort should be stable, we have to reinitialize indices to iota to // guarantee that we still keep the relative order in case of ties. if (is_stable && index > 0) { @@ -71,14 +66,14 @@ TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSort( // calculating the base offset, we need to multiply the index into the 'a' // dimension with 'b' * 'c'. // 'index' / 'c' * 'c' * 'b' = ('index' - 'index' % 'c') * 'b'. 
- int64 base_offset = + tensorflow::int64 base_offset = index % sort_dimension_offset + (index - index % sort_dimension_offset) * sort_dimension_elements; - auto compare_function = [&](int64 a, int64 b) -> bool { - for (int32 i = 0; i < values_count; ++i) { - int64 memory_index_lhs = (base_offset + a * sort_dimension_offset) * + auto compare_function = [&](tensorflow::int64 a, tensorflow::int64 b) -> bool { + for (tensorflow::int32 i = 0; i < values_count; ++i) { + tensorflow::int64 memory_index_lhs = (base_offset + a * sort_dimension_offset) * values_primitive_type_size_in_bytes[i]; - int64 memory_index_rhs = (base_offset + b * sort_dimension_offset) * + tensorflow::int64 memory_index_rhs = (base_offset + b * sort_dimension_offset) * values_primitive_type_size_in_bytes[i]; comparison_values[i * 2] = values[i] + memory_index_lhs; comparison_values[i * 2 + 1] = values[i] + memory_index_rhs; @@ -97,9 +92,9 @@ TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSort( } // Reorder the values according to the order defined by 'indices'. - for (int32 idx = 0; idx < values_count; ++idx) { - for (int64 i = 0; i < sort_dimension_elements; ++i) { - int64 memory_index = + for (tensorflow::int32 idx = 0; idx < values_count; ++idx) { + for (tensorflow::int64 i = 0; i < sort_dimension_elements; ++i) { + tensorflow::int64 memory_index = (base_offset + indices[i] * sort_dimension_offset) * values_primitive_type_size_in_bytes[idx]; @@ -107,8 +102,8 @@ TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSort( std::string(values[idx] + memory_index, values_primitive_type_size_in_bytes[idx]); } - for (int64 i = 0; i < sort_dimension_elements; ++i) { - int64 memory_index = (base_offset + i * sort_dimension_offset) * + for (tensorflow::int64 i = 0; i < sort_dimension_elements; ++i) { + tensorflow::int64 memory_index = (base_offset + i * sort_dimension_offset) * values_primitive_type_size_in_bytes[idx]; memcpy(values[idx] + memory_index, reordered_values[i].c_str(), values_primitive_type_size_in_bytes[idx]); diff --git a/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc b/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc index 35db15fed2c..7e19b383d6f 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc +++ b/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc @@ -27,9 +27,6 @@ limitations under the License. 
#include "tensorflow/core/kernels/eigen_contraction_kernel.h" #endif -using tensorflow::int32; -using tensorflow::int64; - namespace { bool Is16BytesAligned(void* ptr) { @@ -37,19 +34,20 @@ bool Is16BytesAligned(void* ptr) { } template -void MatMul(const void* run_options_ptr, T* out, T* lhs, T* rhs, int64 m, - int64 n, int64 k, int32 transpose_lhs, int32 transpose_rhs) { +void MatMul(const void* run_options_ptr, T* out, T* lhs, T* rhs, + tensorflow::int64 m, tensorflow::int64 n, tensorflow::int64 k, + tensorflow::int32 transpose_lhs, tensorflow::int32 transpose_rhs) { const xla::ExecutableRunOptions* run_options = static_cast(run_options_ptr); - int64 lhs_rows = m; - int64 lhs_cols = k; + tensorflow::int64 lhs_rows = m; + tensorflow::int64 lhs_cols = k; if (transpose_lhs) { std::swap(lhs_rows, lhs_cols); } - int64 rhs_rows = k; - int64 rhs_cols = n; + tensorflow::int64 rhs_rows = k; + tensorflow::int64 rhs_cols = n; if (transpose_rhs) { std::swap(rhs_rows, rhs_cols); } @@ -75,8 +73,9 @@ void MatMul(const void* run_options_ptr, T* out, T* lhs, T* rhs, int64 m, template void MatMulDispatch(const void* run_options_ptr, T* out, T* lhs, T* rhs, - int64 m, int64 n, int64 k, int32 transpose_lhs, - int32 transpose_rhs) { + tensorflow::int64 m, tensorflow::int64 n, + tensorflow::int64 k, tensorflow::int32 transpose_lhs, + tensorflow::int32 transpose_rhs) { bool all_buffers_16b_aligned = Is16BytesAligned(out) && Is16BytesAligned(lhs) && Is16BytesAligned(rhs); @@ -94,45 +93,52 @@ void MatMulDispatch(const void* run_options_ptr, T* out, T* lhs, T* rhs, TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenMatMulF16( const void* run_options_ptr, Eigen::half* out, Eigen::half* lhs, - Eigen::half* rhs, int64 m, int64 n, int64 k, int32 transpose_lhs, - int32 transpose_rhs) { + Eigen::half* rhs, tensorflow::int64 m, tensorflow::int64 n, + tensorflow::int64 k, tensorflow::int32 transpose_lhs, + tensorflow::int32 transpose_rhs) { MatMulDispatch(run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs); } TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenMatMulF32( - const void* run_options_ptr, float* out, float* lhs, float* rhs, int64 m, - int64 n, int64 k, int32 transpose_lhs, int32 transpose_rhs) { + const void* run_options_ptr, float* out, float* lhs, float* rhs, + tensorflow::int64 m, tensorflow::int64 n, tensorflow::int64 k, + tensorflow::int32 transpose_lhs, tensorflow::int32 transpose_rhs) { MatMulDispatch(run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs); } TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenMatMulF64( - const void* run_options_ptr, double* out, double* lhs, double* rhs, int64 m, - int64 n, int64 k, int32 transpose_lhs, int32 transpose_rhs) { + const void* run_options_ptr, double* out, double* lhs, double* rhs, + tensorflow::int64 m, tensorflow::int64 n, tensorflow::int64 k, + tensorflow::int32 transpose_lhs, tensorflow::int32 transpose_rhs) { MatMulDispatch(run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs); } TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenMatMulC64( const void* run_options_ptr, std::complex* out, - std::complex* lhs, std::complex* rhs, int64 m, int64 n, - int64 k, int32 transpose_lhs, int32 transpose_rhs) { + std::complex* lhs, std::complex* rhs, tensorflow::int64 m, + tensorflow::int64 n, tensorflow::int64 k, tensorflow::int32 transpose_lhs, + tensorflow::int32 transpose_rhs) { MatMulDispatch>(run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs); } 
TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenMatMulC128( const void* run_options_ptr, std::complex* out, - std::complex* lhs, std::complex* rhs, int64 m, int64 n, - int64 k, int32 transpose_lhs, int32 transpose_rhs) { + std::complex* lhs, std::complex* rhs, tensorflow::int64 m, + tensorflow::int64 n, tensorflow::int64 k, tensorflow::int32 transpose_lhs, + tensorflow::int32 transpose_rhs) { MatMulDispatch>(run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs); } TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenMatMulS32( - const void* run_options_ptr, int32* out, int32* lhs, int32* rhs, int64 m, - int64 n, int64 k, int32 transpose_lhs, int32 transpose_rhs) { - MatMulDispatch(run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs, - transpose_rhs); + const void* run_options_ptr, tensorflow::int32* out, tensorflow::int32* lhs, + tensorflow::int32* rhs, tensorflow::int64 m, tensorflow::int64 n, + tensorflow::int64 k, tensorflow::int32 transpose_lhs, + tensorflow::int32 transpose_rhs) { + MatMulDispatch(run_options_ptr, out, lhs, rhs, m, n, k, + transpose_lhs, transpose_rhs); } diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv2d.cc b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv2d.cc index 5afccc6a86e..360ce57e808 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv2d.cc +++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv2d.cc @@ -19,18 +19,20 @@ limitations under the License. #include "tensorflow/core/platform/dynamic_annotations.h" #include "tensorflow/core/platform/types.h" -using tensorflow::int64; - TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenSingleThreadedConvF16( const void* run_options_ptr, Eigen::half* out, Eigen::half* lhs, - Eigen::half* rhs, int64 input_batch, int64 input_rows, int64 input_cols, - int64 input_channels, int64 kernel_rows, int64 kernel_cols, - int64 kernel_channels, int64 kernel_filters, int64 output_rows, - int64 output_cols, int64 row_stride, int64 col_stride, int64 padding_top, - int64 padding_bottom, int64 padding_left, int64 padding_right, - int64 lhs_row_dilation, int64 lhs_col_dilation, int64 rhs_row_dilation, - int64 rhs_col_dilation) { + Eigen::half* rhs, tensorflow::int64 input_batch, + tensorflow::int64 input_rows, tensorflow::int64 input_cols, + tensorflow::int64 input_channels, tensorflow::int64 kernel_rows, + tensorflow::int64 kernel_cols, tensorflow::int64 kernel_channels, + tensorflow::int64 kernel_filters, tensorflow::int64 output_rows, + tensorflow::int64 output_cols, tensorflow::int64 row_stride, + tensorflow::int64 col_stride, tensorflow::int64 padding_top, + tensorflow::int64 padding_bottom, tensorflow::int64 padding_left, + tensorflow::int64 padding_right, tensorflow::int64 lhs_row_dilation, + tensorflow::int64 lhs_col_dilation, tensorflow::int64 rhs_row_dilation, + tensorflow::int64 rhs_col_dilation) { tensorflow::xla::EigenConvImpl( Eigen::DefaultDevice(), out, lhs, rhs, input_batch, input_rows, input_cols, input_channels, kernel_rows, kernel_cols, kernel_channels, @@ -42,12 +44,16 @@ __xla_cpu_runtime_EigenSingleThreadedConvF16( TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenSingleThreadedConvF32( const void* run_options_ptr, float* out, float* lhs, float* rhs, - int64 input_batch, int64 input_rows, int64 input_cols, int64 input_channels, - int64 kernel_rows, int64 kernel_cols, int64 kernel_channels, - int64 kernel_filters, int64 output_rows, int64 output_cols, - int64 row_stride, 
int64 col_stride, int64 padding_top, int64 padding_bottom, - int64 padding_left, int64 padding_right, int64 lhs_row_dilation, - int64 lhs_col_dilation, int64 rhs_row_dilation, int64 rhs_col_dilation) { + tensorflow::int64 input_batch, tensorflow::int64 input_rows, + tensorflow::int64 input_cols, tensorflow::int64 input_channels, + tensorflow::int64 kernel_rows, tensorflow::int64 kernel_cols, + tensorflow::int64 kernel_channels, tensorflow::int64 kernel_filters, + tensorflow::int64 output_rows, tensorflow::int64 output_cols, + tensorflow::int64 row_stride, tensorflow::int64 col_stride, + tensorflow::int64 padding_top, tensorflow::int64 padding_bottom, + tensorflow::int64 padding_left, tensorflow::int64 padding_right, + tensorflow::int64 lhs_row_dilation, tensorflow::int64 lhs_col_dilation, + tensorflow::int64 rhs_row_dilation, tensorflow::int64 rhs_col_dilation) { tensorflow::xla::EigenConvImpl( Eigen::DefaultDevice(), out, lhs, rhs, input_batch, input_rows, input_cols, input_channels, kernel_rows, kernel_cols, kernel_channels, diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc index c7601f939c7..a8112c1106b 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc +++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc @@ -23,9 +23,6 @@ limitations under the License. #include "tensorflow/core/kernels/eigen_contraction_kernel.h" #endif -using tensorflow::int32; -using tensorflow::int64; - namespace { bool Is16BytesAligned(void* ptr) { @@ -33,16 +30,17 @@ bool Is16BytesAligned(void* ptr) { } template -void MatMul(const void* run_options_ptr, T* out, T* lhs, T* rhs, int64 m, - int64 n, int64 k, int32 transpose_lhs, int32 transpose_rhs) { - int64 lhs_rows = m; - int64 lhs_cols = k; +void MatMul(const void* run_options_ptr, T* out, T* lhs, T* rhs, + tensorflow::int64 m, tensorflow::int64 n, tensorflow::int64 k, + tensorflow::int32 transpose_lhs, tensorflow::int32 transpose_rhs) { + tensorflow::int64 lhs_rows = m; + tensorflow::int64 lhs_cols = k; if (transpose_lhs) { std::swap(lhs_rows, lhs_cols); } - int64 rhs_rows = k; - int64 rhs_cols = n; + tensorflow::int64 rhs_rows = k; + tensorflow::int64 rhs_cols = n; if (transpose_rhs) { std::swap(rhs_rows, rhs_cols); } @@ -67,8 +65,10 @@ void MatMul(const void* run_options_ptr, T* out, T* lhs, T* rhs, int64 m, template void SingleThreadedMatMulDispatch(const void* run_options_ptr, T* out, T* lhs, - T* rhs, int64 m, int64 n, int64 k, - int32 transpose_lhs, int32 transpose_rhs) { + T* rhs, tensorflow::int64 m, + tensorflow::int64 n, tensorflow::int64 k, + tensorflow::int32 transpose_lhs, + tensorflow::int32 transpose_rhs) { bool all_buffers_16b_aligned = Is16BytesAligned(out) && Is16BytesAligned(lhs) && Is16BytesAligned(rhs); @@ -86,28 +86,27 @@ void SingleThreadedMatMulDispatch(const void* run_options_ptr, T* out, T* lhs, TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenSingleThreadedMatMulF16( const void* run_options_ptr, Eigen::half* out, Eigen::half* lhs, - Eigen::half* rhs, int64 m, int64 n, int64 k, int32 transpose_lhs, - int32 transpose_rhs) { + Eigen::half* rhs, tensorflow::int64 m, tensorflow::int64 n, + tensorflow::int64 k, tensorflow::int32 transpose_lhs, + tensorflow::int32 transpose_rhs) { SingleThreadedMatMulDispatch(run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs); } TF_ATTRIBUTE_NO_SANITIZE_MEMORY void -__xla_cpu_runtime_EigenSingleThreadedMatMulF32(const 
void* run_options_ptr, - float* out, float* lhs, - float* rhs, int64 m, int64 n, - int64 k, int32 transpose_lhs, - int32 transpose_rhs) { +__xla_cpu_runtime_EigenSingleThreadedMatMulF32( + const void* run_options_ptr, float* out, float* lhs, float* rhs, + tensorflow::int64 m, tensorflow::int64 n, tensorflow::int64 k, + tensorflow::int32 transpose_lhs, tensorflow::int32 transpose_rhs) { SingleThreadedMatMulDispatch(run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs); } TF_ATTRIBUTE_NO_SANITIZE_MEMORY void -__xla_cpu_runtime_EigenSingleThreadedMatMulF64(const void* run_options_ptr, - double* out, double* lhs, - double* rhs, int64 m, int64 n, - int64 k, int32 transpose_lhs, - int32 transpose_rhs) { +__xla_cpu_runtime_EigenSingleThreadedMatMulF64( + const void* run_options_ptr, double* out, double* lhs, double* rhs, + tensorflow::int64 m, tensorflow::int64 n, tensorflow::int64 k, + tensorflow::int32 transpose_lhs, tensorflow::int32 transpose_rhs) { SingleThreadedMatMulDispatch(run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs); } @@ -115,8 +114,9 @@ __xla_cpu_runtime_EigenSingleThreadedMatMulF64(const void* run_options_ptr, TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenSingleThreadedMatMulC64( const void* run_options_ptr, std::complex* out, - std::complex* lhs, std::complex* rhs, int64 m, int64 n, - int64 k, int32 transpose_lhs, int32 transpose_rhs) { + std::complex* lhs, std::complex* rhs, tensorflow::int64 m, + tensorflow::int64 n, tensorflow::int64 k, tensorflow::int32 transpose_lhs, + tensorflow::int32 transpose_rhs) { SingleThreadedMatMulDispatch>( run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs); } @@ -124,18 +124,19 @@ __xla_cpu_runtime_EigenSingleThreadedMatMulC64( TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenSingleThreadedMatMulC128( const void* run_options_ptr, std::complex* out, - std::complex* lhs, std::complex* rhs, int64 m, int64 n, - int64 k, int32 transpose_lhs, int32 transpose_rhs) { + std::complex* lhs, std::complex* rhs, tensorflow::int64 m, + tensorflow::int64 n, tensorflow::int64 k, tensorflow::int32 transpose_lhs, + tensorflow::int32 transpose_rhs) { SingleThreadedMatMulDispatch>( run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs); } TF_ATTRIBUTE_NO_SANITIZE_MEMORY void -__xla_cpu_runtime_EigenSingleThreadedMatMulS32(const void* run_options_ptr, - int32* out, int32* lhs, - int32* rhs, int64 m, int64 n, - int64 k, int32 transpose_lhs, - int32 transpose_rhs) { - SingleThreadedMatMulDispatch(run_options_ptr, out, lhs, rhs, m, n, k, - transpose_lhs, transpose_rhs); +__xla_cpu_runtime_EigenSingleThreadedMatMulS32( + const void* run_options_ptr, tensorflow::int32* out, tensorflow::int32* lhs, + tensorflow::int32* rhs, tensorflow::int64 m, tensorflow::int64 n, + tensorflow::int64 k, tensorflow::int32 transpose_lhs, + tensorflow::int32 transpose_rhs) { + SingleThreadedMatMulDispatch( + run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs); } diff --git a/tensorflow/compiler/xla/service/cpu/runtime_topk.cc b/tensorflow/compiler/xla/service/cpu/runtime_topk.cc new file mode 100644 index 00000000000..5174a3329fb --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/runtime_topk.cc @@ -0,0 +1,76 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/cpu/runtime_topk.h" + +#include <algorithm> +#include <cstring> +#include <limits> +#include <numeric> +#include <vector> + +#include "tensorflow/core/platform/dynamic_annotations.h" +#include "tensorflow/core/platform/macros.h" + +template <typename T> +static void TopK(tensorflow::int64 batch_size, tensorflow::int64 input_size, + tensorflow::int64 k, const T* values, T* out_values, + tensorflow::int32* out_indices) { + // 'values' is managed by the JIT code, so msan can't tell they are + // initialized. + TF_ANNOTATE_MEMORY_IS_INITIALIZED(values, + input_size * batch_size * sizeof(T)); + + std::vector<tensorflow::int32> temp_indices(input_size); + for (tensorflow::int64 batch = 0; batch != batch_size; ++batch) { + std::iota(temp_indices.begin(), temp_indices.end(), 0); + + const T* values_batch = values + batch * input_size; + + auto convert_to_int = [](T value) { + tensorflow::uint32 x; + std::memcpy(&x, &value, sizeof(x)); + return static_cast<tensorflow::int32>(x) < 0 + ? std::numeric_limits<tensorflow::int32>::max() - x + : x; + }; + + auto kth_element = temp_indices.begin() + k; + std::partial_sort(temp_indices.begin(), kth_element, temp_indices.end(), + [&](size_t i1, size_t i2) { + // Do the comparison in integers to enforce a total + // order of -NaN < -Inf < -0 < +0 < +Inf < +NaN. + tensorflow::int32 v1 = convert_to_int(values_batch[i1]); + tensorflow::int32 v2 = convert_to_int(values_batch[i2]); + if (v1 == v2) { + return i1 < i2; // Stabilize sorting. + } + return v1 > v2; + }); + + T* out_values_batch = out_values + batch * k; + tensorflow::int32* out_indices_batch = out_indices + batch * k; + std::copy(temp_indices.begin(), kth_element, out_indices_batch); + for (tensorflow::int64 i = 0; i < k; i++) { + out_values_batch[i] = values_batch[temp_indices[i]]; + } + } +} + +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_TopKF32( + tensorflow::int64 batch_size, tensorflow::int64 input_size, + tensorflow::int64 k, const float* values, float* out_values, + tensorflow::int32* out_indices) { + TopK(batch_size, input_size, k, values, out_values, out_indices); +} diff --git a/tensorflow/compiler/xla/service/cpu/runtime_topk.h b/tensorflow/compiler/xla/service/cpu/runtime_topk.h new file mode 100644 index 00000000000..de69c0603e3 --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/runtime_topk.h @@ -0,0 +1,32 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
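Regarding the integer comparison used in the TopK comparator above: reinterpreting the float bits and remapping values with the sign bit set yields a total order -NaN < -Inf < -0 < +0 < +Inf < +NaN on an IEEE-754 platform. A standalone demonstration of the same mapping (an illustrative sketch, not part of the patch, written with fixed-width standard types):

#include <cassert>
#include <cstdint>
#include <cstring>
#include <limits>

// Same transformation as convert_to_int in runtime_topk.cc.
static int32_t ToOrderedInt(float value) {
  uint32_t x;
  std::memcpy(&x, &value, sizeof(x));
  return static_cast<int32_t>(x) < 0 ? std::numeric_limits<int32_t>::max() - x
                                     : x;
}

int main() {
  const float inf = std::numeric_limits<float>::infinity();
  const float nan = std::numeric_limits<float>::quiet_NaN();
  // Total order: -NaN < -Inf < -0 < +0 < +Inf < +NaN.
  assert(ToOrderedInt(-nan) < ToOrderedInt(-inf));
  assert(ToOrderedInt(-inf) < ToOrderedInt(-0.0f));
  assert(ToOrderedInt(-0.0f) < ToOrderedInt(0.0f));
  assert(ToOrderedInt(0.0f) < ToOrderedInt(inf));
  assert(ToOrderedInt(inf) < ToOrderedInt(nan));
  // Ordinary finite values keep their usual order.
  assert(ToOrderedInt(-2.5f) < ToOrderedInt(-1.0f));
  assert(ToOrderedInt(1.0f) < ToOrderedInt(2.5f));
  return 0;
}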
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_TOPK_H +#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_TOPK_H + +#include "tensorflow/core/platform/types.h" + +extern "C" { + +// Calculates `batch_size` topk operations with `input_size` inputs each. The +// outputs are written to `out_values` and `out_indices`. +extern void __xla_cpu_runtime_TopKF32(tensorflow::int64 batch_size, + tensorflow::int64 input_size, + tensorflow::int64 k, const float* values, + float* out_values, + tensorflow::int32* out_indices); +} + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_TOPK_H diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc index 631c6985b03..28508bde4cd 100644 --- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc +++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc @@ -44,6 +44,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv2d.h" #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h" #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h" +#include "tensorflow/compiler/xla/service/cpu/runtime_topk.h" #include "tensorflow/compiler/xla/service/cpu/windows_compatibility.h" #include "tensorflow/compiler/xla/service/custom_call_target_registry.h" #include "tensorflow/compiler/xla/types.h" @@ -270,6 +271,7 @@ bool RegisterKnownJITSymbols() { REGISTER_CPU_RUNTIME_SYMBOL(ReleaseInfeedBufferAfterDequeue); REGISTER_CPU_RUNTIME_SYMBOL(ReleaseOutfeedBufferAfterPopulation); REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSort); + REGISTER_CPU_RUNTIME_SYMBOL(TopKF32); REGISTER_CPU_RUNTIME_SYMBOL(TracingStart); REGISTER_CPU_RUNTIME_SYMBOL(TracingEnd); diff --git a/tensorflow/compiler/xla/service/cpu/tests/BUILD b/tensorflow/compiler/xla/service/cpu/tests/BUILD index d7c50dce3ca..527071d5f31 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/BUILD +++ b/tensorflow/compiler/xla/service/cpu/tests/BUILD @@ -253,6 +253,22 @@ tf_cc_test( ], ) +tf_cc_test( + name = "cpu_topk_test", + srcs = ["cpu_topk_test.cc"], + deps = [ + ":cpu_codegen_test", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client/lib:sorting", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service/cpu:cpu_compiler", + "//tensorflow/compiler/xla/service/cpu:test_header_helper", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + tf_cc_test( name = "cpu_vectorization_test", srcs = ["cpu_vectorization_test.cc"], diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_topk_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_topk_test.cc new file mode 100644 index 00000000000..b7647fb4b16 --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_topk_test.cc @@ -0,0 +1,86 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "tensorflow/compiler/xla/client/lib/sorting.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h" +#include "tensorflow/compiler/xla/service/cpu/test_target_triple_helper.h" +#include "tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h" + +namespace xla { +namespace cpu { +namespace { + +using CpuTopKTest = CpuCodegenTest; + +TEST_F(CpuTopKTest, CallRuntimeUnbatched) { + XlaBuilder builder(TestName()); + XlaOp input = + Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {100}), "input"); + TopK(input, 10); + TF_ASSERT_OK_AND_ASSIGN(XlaComputation xla_computation, builder.Build()); + + TF_ASSERT_OK_AND_ASSIGN(ProgramShape program_shape, + xla_computation.GetProgramShape()); + HloModuleConfig config(program_shape); + TF_ASSERT_OK_AND_ASSIGN( + auto module, HloModule::CreateFromProto(xla_computation.proto(), config)); + + constexpr char filecheck_pattern[] = R"( + CHECK: call void @__xla_cpu_runtime_TopKF32(i64 1, i64 100, i64 10, + )"; + + CpuAotCompilationOptions options{ + /*triple=*/kTargetTripleForHost, /*cpu_name=*/kTargetCpuForHost, + /*features=*/"", + /*entry_point_name=*/"entry", + /*relocation_model=*/CpuAotCompilationOptions::RelocationModel::Static}; + + CompileAheadOfTimeAndVerifyIr(std::move(module), options, filecheck_pattern, + /*match_optimized_ir=*/true); +} + +TEST_F(CpuTopKTest, CallRuntimeBatched) { + XlaBuilder builder(TestName()); + XlaOp input = + Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {5, 100}), "input"); + TopK(input, 10); + TF_ASSERT_OK_AND_ASSIGN(XlaComputation xla_computation, builder.Build()); + + TF_ASSERT_OK_AND_ASSIGN(ProgramShape program_shape, + xla_computation.GetProgramShape()); + HloModuleConfig config(program_shape); + TF_ASSERT_OK_AND_ASSIGN( + auto module, HloModule::CreateFromProto(xla_computation.proto(), config)); + + constexpr char filecheck_pattern[] = R"( + CHECK: call void @__xla_cpu_runtime_TopKF32(i64 5, i64 100, i64 10, + )"; + + CpuAotCompilationOptions options{ + /*triple=*/kTargetTripleForHost, /*cpu_name=*/kTargetCpuForHost, + /*features=*/"", + /*entry_point_name=*/"entry", + /*relocation_model=*/CpuAotCompilationOptions::RelocationModel::Static}; + + CompileAheadOfTimeAndVerifyIr(std::move(module), options, filecheck_pattern, + /*match_optimized_ir=*/true); +} + +} // namespace +} // namespace cpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/dot_as_convolution_util.cc b/tensorflow/compiler/xla/service/dot_as_convolution_util.cc index fcdf85d5ecb..4670ce6940a 100644 --- a/tensorflow/compiler/xla/service/dot_as_convolution_util.cc +++ b/tensorflow/compiler/xla/service/dot_as_convolution_util.cc @@ -24,6 +24,31 @@ limitations under the License. namespace xla { namespace dot_as_convolution_util { +bool ConvSpatialDimensionIsParallel(const WindowDimension& wd, int64 lhs_size) { + // A parallel batch dimension in DotGeneral is represented as a + // spatial dimension with window size B (batch dimension size), + // stride B - 1, and base dilation B. 
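As a quick sanity check of the window configuration described above (an illustrative sketch, not part of the patch): with B = 4, a window of size B, base dilation B and stride B - 1 reproduces an output dimension of size B, which is why such a spatial dimension behaves like a parallel batch dimension.

#include <cassert>

int main() {
  const long long B = 4;  // batch dimension size
  const long long window_size = B;
  const long long base_dilation = B;
  const long long stride = B - 1;
  // Base-dilated input extent, with window_dilation == 1 and no padding.
  const long long dilated_input = (B - 1) * base_dilation + 1;  // 13
  const long long output_size = (dilated_input - window_size) / stride + 1;
  assert(output_size == B);  // the dimension passes through unchanged
  return 0;
}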
+ if (lhs_size == wd.size() && lhs_size == wd.base_dilation() && + ((std::max<int64>(1, lhs_size - 1) == wd.stride() && + wd.window_dilation() == 1) || + (std::max<int64>(1, lhs_size - 1) == wd.window_dilation() && + wd.stride() == 1)) && + wd.padding_high() == 0 && wd.padding_low() == 0 && + !wd.window_reversal()) { + return true; + } + + // Alternative representation of a batch dimension. + if (wd.size() == lhs_size && wd.padding_high() == lhs_size - 1 && + wd.padding_low() == lhs_size - 1 && wd.window_reversal() && + wd.window_dilation() == 1 && wd.stride() == lhs_size && + wd.base_dilation() == lhs_size - 1) { + return true; + } + + return false; +} + /* static */ absl::optional<DotGeneralAsConvolutionDimsInfo> ParseDotGeneralFromConvolution(const HloInstruction* conv) { CHECK_EQ(conv->opcode(), HloOpcode::kConvolution); @@ -49,14 +74,7 @@ ParseDotGeneralFromConvolution(const HloInstruction* conv) { int64 rhs_size = conv->operand(1)->shape().dimensions(rhs); int64 output = conv_dims.output_spatial_dimensions(i); const auto& wd = conv->window().dimensions(i); - if (lhs_size == wd.size() && - std::max<int64>(1, lhs_size - 1) == wd.stride() && - lhs_size == wd.base_dilation() && wd.window_dilation() == 1 && - wd.padding_high() == 0 && wd.padding_low() == 0 && - !wd.window_reversal()) { - // A batch dimension in DotGeneral is represented as a spatial dimension - // with window size B (batch dimension size), stride B - 1, and base - // dilation B. + if (ConvSpatialDimensionIsParallel(wd, lhs_size)) { dims.batch_dims.push_back({lhs, rhs, output, i}); } else if (lhs_size == wd.size() && wd.base_dilation() == 1 && wd.window_dilation() == 1 && wd.padding_high() == 0 && diff --git a/tensorflow/compiler/xla/service/dot_as_convolution_util.h b/tensorflow/compiler/xla/service/dot_as_convolution_util.h index a3e829a3d31..6a7cacf812d 100644 --- a/tensorflow/compiler/xla/service/dot_as_convolution_util.h +++ b/tensorflow/compiler/xla/service/dot_as_convolution_util.h @@ -62,6 +62,12 @@ CreateShardedConvForDotGeneralConvolution( const DotGeneralAsConvolutionDimsInfo& dot_dnums, HloInstruction* sharded_lhs_hlo, HloInstruction* sharded_rhs_hlo); +// Check if a spatial dim is a parallel batch dimension. +// A parallel batch dimension in DotGeneral is represented as a spatial +// dimension with window size B (batch dimension size), stride B - 1, and base +// dilation B.
+bool ConvSpatialDimensionIsParallel(const WindowDimension& wd, int64 lhs_size); + } // namespace dot_as_convolution_util } // namespace xla diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc b/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc index 6ebbf622614..36429d3d755 100644 --- a/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc +++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc @@ -148,15 +148,12 @@ class DynamicDimensionInferenceVisitor : public DfsHloVisitorWithDefault { Status HandleDomain(HloInstruction* hlo) override; private: - using DimensionConstraint = DynamicDimensionInference::DimensionConstraint; using OperandDynamicDimensionFn = std::function; + int64 operand_index, HloInstruction* dynamic_size)>; using DynamicDimensionFn = std::function; + ShapeIndex index, int64 dimension, HloInstruction* dynamic_size)>; Status ForEachOperandDynamicDimension(HloInstruction* inst, const OperandDynamicDimensionFn&); @@ -184,8 +181,7 @@ class DynamicDimensionInferenceVisitor : public DfsHloVisitorWithDefault { Status DynamicDimensionInferenceVisitor::DefaultAction(HloInstruction* hlo) { return ForEachOperandDynamicDimension( hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension, - int64 operand_index, HloInstruction* dynamic_size, - DimensionConstraint constraint) { + int64 operand_index, HloInstruction* dynamic_size) { return UnimplementedStrCat( "Asked to propagate a dynamic dimension from hlo ", operand->name(), "@", index.ToString(), "@", dimension, " to hlo ", hlo->ToString(), @@ -197,13 +193,11 @@ Status DynamicDimensionInferenceVisitor::HandleGetTupleElement( HloInstruction* hlo) { return ForEachOperandDynamicDimension( hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension, - int64 operand_index, HloInstruction* dynamic_size, - DimensionConstraint constraint) { + int64 operand_index, HloInstruction* dynamic_size) { if (hlo->tuple_index() == index[0]) { ShapeIndex new_index = ShapeIndexView(index).ConsumeFront().ToShapeIndex(); - parent_->SetDynamicSize(hlo, new_index, dimension, dynamic_size, - constraint); + parent_->SetDynamicSize(hlo, new_index, dimension, dynamic_size); } return Status::OK(); }); @@ -212,11 +206,9 @@ Status DynamicDimensionInferenceVisitor::HandleGetTupleElement( Status DynamicDimensionInferenceVisitor::HandleTuple(HloInstruction* hlo) { return ForEachOperandDynamicDimension( hlo, [&](HloInstruction*, ShapeIndex index, int64 dimension, - int64 operand_index, HloInstruction* dynamic_size, - DimensionConstraint constraint) { + int64 operand_index, HloInstruction* dynamic_size) { index.push_front(operand_index); - parent_->SetDynamicSize(hlo, index, dimension, dynamic_size, - constraint); + parent_->SetDynamicSize(hlo, index, dimension, dynamic_size); return Status::OK(); }); } @@ -224,11 +216,9 @@ Status DynamicDimensionInferenceVisitor::HandleTuple(HloInstruction* hlo) { Status DynamicDimensionInferenceVisitor::HandleBroadcast(HloInstruction* hlo) { return ForEachOperandDynamicDimension( hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension, - int64 operand_index, HloInstruction* dynamic_size, - DimensionConstraint constraint) { + int64 operand_index, HloInstruction* dynamic_size) { int64 broadcast_dim = hlo->dimensions(dimension); - parent_->SetDynamicSize(hlo, {}, broadcast_dim, dynamic_size, - constraint); + parent_->SetDynamicSize(hlo, {}, broadcast_dim, dynamic_size); return Status::OK(); }); } @@ -244,8 +234,7 @@ Status 
DynamicDimensionInferenceVisitor::HandleCustomCall(HloInstruction* hlo) { // returns the padded data output and the dynamic sizes of input // dimensions. ShapeIndex data_output = {0}; - parent_->SetDynamicSize(hlo, data_output, i, dynamic_size, - DimensionConstraint(1, 1)); + parent_->SetDynamicSize(hlo, data_output, i, dynamic_size); } } return Status::OK(); @@ -255,15 +244,14 @@ Status DynamicDimensionInferenceVisitor::HandleCustomCall(HloInstruction* hlo) { } return ForEachOperandDynamicDimension( hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension, - int64 operand_index, HloInstruction* dynamic_size, - DimensionConstraint constraint) { + int64 operand_index, HloInstruction* dynamic_size) { // Resize custom call should propagate dynamic batch (0) and channel (3) // dimensions. if (hlo->custom_call_target() == "SliceToDynamic" || hlo->custom_call_target() == "Sharding" || (absl::StartsWith(hlo->custom_call_target(), "Resize") && (dimension == 0 || dimension == 3))) { - parent_->SetDynamicSize(hlo, {}, dimension, dynamic_size, constraint); + parent_->SetDynamicSize(hlo, {}, dimension, dynamic_size); return Status::OK(); } return Unimplemented( @@ -274,16 +262,15 @@ Status DynamicDimensionInferenceVisitor::HandleCustomCall(HloInstruction* hlo) { Status DynamicDimensionInferenceVisitor::HandleSort(HloInstruction* hlo) { return ForEachOperandDynamicDimension( - hlo, [&](HloInstruction* operand, ShapeIndex index, - int64 dynamic_dimension, int64 operand_index, - HloInstruction* dynamic_size, DimensionConstraint constraint) { + hlo, + [&](HloInstruction* operand, ShapeIndex index, int64 dynamic_dimension, + int64 operand_index, HloInstruction* dynamic_size) { HloSortInstruction* sort = Cast(hlo); if (sort->values_count() == 0) { - parent_->SetDynamicSize(hlo, {}, dynamic_dimension, dynamic_size, - constraint); + parent_->SetDynamicSize(hlo, {}, dynamic_dimension, dynamic_size); } else { parent_->SetDynamicSize(hlo, {operand_index}, dynamic_dimension, - dynamic_size, constraint); + dynamic_size); } return Status::OK(); @@ -293,8 +280,7 @@ Status DynamicDimensionInferenceVisitor::HandleSort(HloInstruction* hlo) { Status DynamicDimensionInferenceVisitor::HandlePad(HloInstruction* hlo) { return ForEachOperandDynamicDimension( hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension, - int64 operand_index, HloInstruction* dynamic_size, - DimensionConstraint constraint) { + int64 operand_index, HloInstruction* dynamic_size) { if (operand_index != 0) { return Unimplemented( "Dynamic dimension on padding value is not supported"); @@ -311,8 +297,7 @@ Status DynamicDimensionInferenceVisitor::HandlePad(HloInstruction* hlo) { hlo->parent()->AddInstruction(HloInstruction::CreateBinary( dynamic_size_adjusted->shape(), HloOpcode::kAdd, dynamic_size_adjusted, adjustment)); - parent_->SetDynamicSize(hlo, {}, dimension, dynamic_size_adjusted, - constraint); + parent_->SetDynamicSize(hlo, {}, dimension, dynamic_size_adjusted); return Status::OK(); } else { return Unimplemented( @@ -327,8 +312,7 @@ Status DynamicDimensionInferenceVisitor::HandlePad(HloInstruction* hlo) { Status DynamicDimensionInferenceVisitor::HandleReduce(HloInstruction* hlo) { return ForEachOperandDynamicDimension( hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension, - int64 operand_index, HloInstruction* dynamic_size, - DimensionConstraint constraint) { + int64 operand_index, HloInstruction* dynamic_size) { HloInstruction* reduce = hlo; int64 operand_count = reduce->operand_count(); bool 
is_variadic_reduce = operand_count > 2; @@ -354,13 +338,12 @@ Status DynamicDimensionInferenceVisitor::HandleReduce(HloInstruction* hlo) { // reduce has a dynamic dimension, we set all outputs to use the // same dynamic size in corresponding dimensions. for (int64 i = 0; i < operand_count / 2; ++i) { - parent_->SetDynamicSize(reduce, {i}, - dimensions_not_reduced_count, - dynamic_size, constraint); + parent_->SetDynamicSize( + reduce, {i}, dimensions_not_reduced_count, dynamic_size); } } else { parent_->SetDynamicSize(reduce, {}, dimensions_not_reduced_count, - dynamic_size, constraint); + dynamic_size); } return Status::OK(); @@ -378,7 +361,7 @@ Status DynamicDimensionInferenceVisitor::HandleDot(HloInstruction* hlo) { return ForEachOperandDynamicDimension( hlo, [&](HloInstruction* operand, ShapeIndex operand_shape_index, int64 operand_dimension, int64 operand_index, - HloInstruction* dynamic_size, DimensionConstraint constraint) { + HloInstruction* dynamic_size) { // There are three types of dimensions in a dot: // A. batch dims // B. contracting dims @@ -451,8 +434,7 @@ Status DynamicDimensionInferenceVisitor::HandleDot(HloInstruction* hlo) { // work item to trace that dimension. auto iter = result_dim_mapping.find(operand_dimension); if (iter != result_dim_mapping.end()) { - parent_->SetDynamicSize(dot, {}, iter->second, dynamic_size, - constraint); + parent_->SetDynamicSize(dot, {}, iter->second, dynamic_size); } return Status::OK(); @@ -463,8 +445,7 @@ Status DynamicDimensionInferenceVisitor::HandleTranspose(HloInstruction* hlo) { return ForEachOperandDynamicDimension( hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension, - int64 operand_index, HloInstruction* dynamic_size, - DimensionConstraint constraint) -> Status { + int64 operand_index, HloInstruction* dynamic_size) -> Status { int64 permuted_dim = -1; for (int64 i = 0; i < hlo->dimensions().size(); ++i) { if (hlo->dimensions()[i] == dimension) { @@ -472,8 +453,7 @@ Status DynamicDimensionInferenceVisitor::HandleTranspose(HloInstruction* hlo) { permuted_dim = i; } } - parent_->SetDynamicSize(hlo, {}, permuted_dim, dynamic_size, - constraint); + parent_->SetDynamicSize(hlo, {}, permuted_dim, dynamic_size); return Status::OK(); }); } @@ -482,8 +462,7 @@ Status DynamicDimensionInferenceVisitor::HandleConvolution( HloInstruction* hlo) { return ForEachOperandDynamicDimension( hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension, - int64 operand_index, HloInstruction* dynamic_size, - DimensionConstraint constraint) { + int64 operand_index, HloInstruction* dynamic_size) { HloInstruction* conv = hlo; const ConvolutionDimensionNumbers& dimension_numbers = conv->convolution_dimension_numbers(); @@ -492,7 +471,7 @@ Status DynamicDimensionInferenceVisitor::HandleConvolution( if (dimension == dimension_numbers.input_batch_dimension()) { parent_->SetDynamicSize(conv, {}, dimension_numbers.output_batch_dimension(), - dynamic_size, constraint); + dynamic_size); return Status::OK(); } @@ -542,20 +521,18 @@ Status DynamicDimensionInferenceVisitor::HandleConcatenate( dim_size_total, dynamic_dim)); } parent_->SetDynamicSize(hlo, {}, hlo->concatenate_dimension(), - dim_size_total, DimensionConstraint(1, 1)); + dim_size_total); } // Simply pass through non-concat dynamic dimensions. 
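For intuition, a brief illustrative note on the concatenate handling above (names taken from the code, example values assumed):

// Example (illustrative, not part of the patch): concatenating f32[<=3] and
// f32[4] along dimension 0 accumulates the operands' sizes into
// dim_size_total, so the output's dynamic size on the concat dimension is
// first_operand_size + 4, while a dynamic size on any other dimension is
// simply forwarded by the pass-through below.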
return ForEachOperandDynamicDimension( hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension, - int64 operand_index, HloInstruction* dynamic_size, - DimensionConstraint constraint) { + int64 operand_index, HloInstruction* dynamic_size) { int64 concatenate_dimension = hlo->concatenate_dimension(); if (concatenate_dimension == dimension) { return Status::OK(); } - parent_->SetDynamicSize(hlo, index, dimension, dynamic_size, - constraint); + parent_->SetDynamicSize(hlo, index, dimension, dynamic_size); return Status::OK(); }); } @@ -596,18 +573,15 @@ Status DynamicDimensionInferenceVisitor::HandleSetDimensionSize( if (!dimension_is_static) { // Propagate dynamic dimension indicated by this set dimension size // instruction. - parent_->SetDynamicSize(hlo, {}, hlo->dimension(), hlo->mutable_operand(1), - DimensionConstraint(1, 1)); + parent_->SetDynamicSize(hlo, {}, hlo->dimension(), hlo->mutable_operand(1)); } // Also Propagate dynamic dimension already set by operands. TF_RETURN_IF_ERROR(ForEachOperandDynamicDimension( hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension, - int64 operand_index, HloInstruction* dynamic_size, - DimensionConstraint constraint) { + int64 operand_index, HloInstruction* dynamic_size) { if (dimension != hlo->dimension()) { - parent_->SetDynamicSize(hlo, index, dimension, dynamic_size, - constraint); + parent_->SetDynamicSize(hlo, index, dimension, dynamic_size); } return Status::OK(); })); @@ -619,10 +593,8 @@ Status DynamicDimensionInferenceVisitor::PassThroughDynamicDimension( HloInstruction* hlo) { return ForEachOperandDynamicDimension( hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension, - int64 operand_index, HloInstruction* dynamic_size, - DimensionConstraint constraint) { - parent_->SetDynamicSize(hlo, index, dimension, dynamic_size, - constraint); + int64 operand_index, HloInstruction* dynamic_size) { + parent_->SetDynamicSize(hlo, index, dimension, dynamic_size); return Status::OK(); }); } @@ -654,8 +626,7 @@ Status DynamicDimensionInferenceVisitor::HandleReshape(HloInstruction* hlo) { hlo, [&](HloInstruction* operand, ShapeIndex index, int64 input_dynamic_dimension, int64 operand_index, - HloInstruction* operand_dynamic_size, - DimensionConstraint constraint) -> Status { + HloInstruction* operand_dynamic_size) -> Status { HloInstruction* reshape = hlo; if (reshape->shape().rank() == 0) { VLOG(0) << "Reshaping a dynamic dimension into a scalar, which has " @@ -751,9 +722,6 @@ Status DynamicDimensionInferenceVisitor::HandleReshape(HloInstruction* hlo) { if (output_dynamic_dimension == -1 && output_dim_end - output_dim_start > 1) { - // TODO(yunxing): We now have a better way to decide output dimension - // in the bridge. No need for this constraint propagation logic. - // // One input dimension is splitted into multiple output dimensions. // Output dimension is decomposed from input most major dimension. // In this case, we don't know which one is dynamic, e.g., when we @@ -770,61 +738,17 @@ Status DynamicDimensionInferenceVisitor::HandleReshape(HloInstruction* hlo) { // We use the following logics to disambiguate: // 1. If the user sets "inferred_dimension", then use that as // dynamic dimension. + // 2. If the one dimension in the reshape is dynamic, use that as + // dynamic dimension. + // E.g.: + // [<=4] + // | + // reshape + // | + // [1, <=2, 2] + // We use second dim as dynamic dimension. // - // 2. 
Use the "multiple_of" constraint, e.g, : - // [<=2, 4] - // | Reshape - // [<=8] - // | Reshape - // [2, 4] // Which is dynamic? - // - // If the dynamic value has to be multiple of 4 (constraint - // created by the first reshape), then 2 must be the dynamic - // dimension. - // - // But this logic doesn't help with the case where two - // dimensions are the same: - // - // [<=3, 3] - // | Reshape - // [<=9] - // | Reshape - // [3, 3] // Which is dynamic? - // - // Both dynamic dimension can be multiple of 3. - // - // We then need the next constraint to disambiguate this case: - // - // 3. Use the "stride" constraint (also see the comment at the - // definition): - // - // [<=3, 3] - // | Reshape - // [<=9] // constraint.stride = 1 - // | Reshape - // [3, 3] - // ^ ^ - // | | - // stride= 1 3 - // - // Each dimension will have different strides, only one will - // satisfy the stride constraint. - // - // Note that the stride constrint itself is not enough: - // - // - // [<=128] - // | Reshape - // [1, 128] - // ^ ^ - // | | - // stride= 1 1 - // - // In this case, both dimensions have the same stride, which is - // ambiguous. That's why we need the "multiple_of" constraint - // as used above. - // - // 4. If all logics above cannot disambiguate, e.g.,: + // 3. If all logics above cannot disambiguate, e.g.,: // // [<=1] // | @@ -833,68 +757,15 @@ Status DynamicDimensionInferenceVisitor::HandleReshape(HloInstruction* hlo) { // [1, 1, 1] // // We bail out and return an error. + // TODO(yunxing): Further simplify this, remove 1. and fully rely + // on 2. output_dynamic_dimension = reshape->inferred_dimension(); if (output_dynamic_dimension == -1) { - // The user of XLA didn't specify a dynamic dimension, try infer - // it from the current constraint. - // - // Find all output dimensions that are decomposed from the first - // dimension. Among those dimensions, find all dimensions that - // satisfy the constraint of the dynamic dimension. In the - // previous example, if `a` is 9 and constraint is a multiple of - // `3', then in the output shape both a/c and c can be dynamic. - int64 current_product = 1; - int64 dimension_iter = output_dim_start; - - // compatible_dimensions are dimensions that satisfies - // "multiple_of" constraints. - std::vector compatible_dimensions; - while (current_product < - operand->shape().dimensions(input_dynamic_dimension)) { - current_product *= reshape->shape().dimensions(dimension_iter); - if (operand->shape().dimensions(input_dynamic_dimension) / - reshape->shape().dimensions(dimension_iter) == - constraint.multiple_of) { - compatible_dimensions.push_back(dimension_iter); + // Try find dynamic dimension from the result shape. + for (int64 i = 0; i < reshape->shape().rank(); ++i) { + if (reshape->shape().is_dynamic_dimension(i)) { + output_dynamic_dimension = i; } - dimension_iter++; - } - CHECK_EQ(current_product, - operand->shape().dimensions(input_dynamic_dimension)) - << "Not a valid reshape: " << hlo->ToString(); - // If there is only one compatible dimension, it must be the - // dynamic one in the output. - if (compatible_dimensions.size() == 1) { - output_dynamic_dimension = compatible_dimensions[0]; - } - - // When there are multiple compatible dimensions, e.g: - // [<=9] - // | Reshape - // [3, 3] - // Use stride constraint to figure out which one is the true - // dynamic one. 
- // - // [<=9] - // | Reshape - // [3, 3] - // ^ ^ - // | | - // stride= 1 3 - // - std::vector compatible_dimensions_with_stride; - absl::c_copy_if( - compatible_dimensions, - std::back_inserter(compatible_dimensions_with_stride), - [&](int64 dimension) { - int64 stride_total = 1; - for (int64 i = 0; i < dimension + 1; ++i) { - stride_total *= reshape->shape().dimensions(dimension); - } - return stride_total == constraint.stride; - }); - if (compatible_dimensions_with_stride.size() == 1) { - output_dynamic_dimension = compatible_dimensions_with_stride[0]; } } @@ -914,9 +785,8 @@ Status DynamicDimensionInferenceVisitor::HandleReshape(HloInstruction* hlo) { return InvalidArgument( "Reshape's input dynamic dimension is decomposed into " "multiple output dynamic dimensions, but the constraint is " - "ambiguous and XLA can't infer the output dimension %s. " - "Constraint: multiple_of: %lld, stride: %lld", - hlo->ToString(), constraint.multiple_of, constraint.stride); + "ambiguous and XLA can't infer the output dimension %s. ", + hlo->ToString()); } } @@ -931,11 +801,12 @@ Status DynamicDimensionInferenceVisitor::HandleReshape(HloInstruction* hlo) { if (input_dim_size == output_dim_size) { // Simply forward dynamic dimension. parent_->SetDynamicSize(reshape, {}, output_dynamic_dimension, - operand_dynamic_size, constraint); + operand_dynamic_size); } if (input_dim_size > output_dim_size) { - TF_RET_CHECK(input_dim_size % output_dim_size == 0); + TF_RET_CHECK(input_dim_size % output_dim_size == 0) + << reshape->ToString(); const int64 divisor = input_dim_size / output_dim_size; HloInstruction* divisor_hlo = hlo->parent()->AddInstruction(HloInstruction::CreateConstant( @@ -946,9 +817,8 @@ Status DynamicDimensionInferenceVisitor::HandleReshape(HloInstruction* hlo) { operand_dynamic_size->shape(), HloOpcode::kDivide, operand_dynamic_size, divisor_hlo)); - parent_->SetDynamicSize( - reshape, {}, output_dynamic_dimension, new_dynamic_size, - DimensionConstraint(1, constraint.multiple_of / divisor)); + parent_->SetDynamicSize(reshape, {}, output_dynamic_dimension, + new_dynamic_size); } if (input_dim_size < output_dim_size) { @@ -985,12 +855,8 @@ Status DynamicDimensionInferenceVisitor::HandleReshape(HloInstruction* hlo) { hlo->parent()->AddInstruction(HloInstruction::CreateBinary( output_dynamic_size->shape(), HloOpcode::kMultiply, new_dynamic_size, operand_dynamic_size)); - int64 new_multiple_of_constraint = - constraint.multiple_of * output_dim_size / - operand->shape().dimensions(input_dynamic_dimension); - parent_->SetDynamicSize( - reshape, {}, output_dynamic_dimension, new_dynamic_size, - DimensionConstraint(1, new_multiple_of_constraint)); + parent_->SetDynamicSize(reshape, {}, output_dynamic_dimension, + new_dynamic_size); } return Status::OK(); @@ -1001,8 +867,7 @@ Status DynamicDimensionInferenceVisitor::HandleReduceWindow( HloInstruction* hlo) { return ForEachOperandDynamicDimension( hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension, - int64 operand_index, HloInstruction* dynamic_size, - DimensionConstraint constraint) { + int64 operand_index, HloInstruction* dynamic_size) { HloInstruction* reduce_window = hlo; const WindowDimension& window_dimension = reduce_window->window().dimensions(dimension); @@ -1013,8 +878,7 @@ Status DynamicDimensionInferenceVisitor::HandleReduceWindow( reduce_window->ToString()); } - parent_->SetDynamicSize(reduce_window, {}, dimension, dynamic_size, - constraint); + parent_->SetDynamicSize(reduce_window, {}, dimension, dynamic_size); return 
Status::OK(); }); @@ -1024,8 +888,7 @@ Status DynamicDimensionInferenceVisitor::HandleSelectAndScatter( HloInstruction* hlo) { return ForEachOperandDynamicDimension( hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension, - int64 operand_index, HloInstruction* dynamic_size, - DimensionConstraint constraint) { + int64 operand_index, HloInstruction* dynamic_size) { HloInstruction* select_and_scatter = hlo; const WindowDimension& window_dimension = select_and_scatter->window().dimensions(dimension); @@ -1036,8 +899,8 @@ Status DynamicDimensionInferenceVisitor::HandleSelectAndScatter( select_and_scatter->ToString()); } - parent_->SetDynamicSize(select_and_scatter, {}, dimension, dynamic_size, - constraint); + parent_->SetDynamicSize(select_and_scatter, {}, dimension, + dynamic_size); return Status::OK(); }); @@ -1046,8 +909,7 @@ Status DynamicDimensionInferenceVisitor::HandleSelectAndScatter( Status DynamicDimensionInferenceVisitor::HandleSlice(HloInstruction* hlo) { return ForEachOperandDynamicDimension( hlo, [&](HloInstruction* operand, ShapeIndex /*index*/, int64 dimension, - int64 /*operand_index*/, HloInstruction* dynamic_size, - DimensionConstraint constraint) { + int64 /*operand_index*/, HloInstruction* dynamic_size) { if (hlo->slice_starts(dimension) != 0 || hlo->slice_strides(dimension) != 1 || hlo->slice_limits(dimension) != @@ -1056,7 +918,7 @@ Status DynamicDimensionInferenceVisitor::HandleSlice(HloInstruction* hlo) { return Status::OK(); } - parent_->SetDynamicSize(hlo, {}, dimension, dynamic_size, constraint); + parent_->SetDynamicSize(hlo, {}, dimension, dynamic_size); return Status::OK(); }); @@ -1066,8 +928,7 @@ Status DynamicDimensionInferenceVisitor::HandleDynamicSlice( HloInstruction* hlo) { return ForEachOperandDynamicDimension( hlo, [&](HloInstruction*, ShapeIndex /*index*/, int64 dimension, - int64 /*operand_index*/, HloInstruction* dynamic_size, - DimensionConstraint constraint) { + int64 /*operand_index*/, HloInstruction* dynamic_size) { if (hlo->shape().dimensions(dimension) != hlo->operand(0)->shape().dimensions(dimension)) { // Slicing a single element out kills the dynamic dimension. 
@@ -1080,7 +941,7 @@ Status DynamicDimensionInferenceVisitor::HandleDynamicSlice( hlo->ToString()); } - parent_->SetDynamicSize(hlo, {}, dimension, dynamic_size, constraint); + parent_->SetDynamicSize(hlo, {}, dimension, dynamic_size); return Status::OK(); }); @@ -1089,9 +950,9 @@ Status DynamicDimensionInferenceVisitor::HandleDynamicSlice( Status DynamicDimensionInferenceVisitor::HandleDynamicUpdateSlice( HloInstruction* hlo) { return ForEachOperandDynamicDimension( - hlo, [&](HloInstruction* /*operand*/, ShapeIndex /*index*/, - int64 dimension, int64 /*operand_index*/, - HloInstruction* dynamic_size, DimensionConstraint constraint) { + hlo, + [&](HloInstruction* /*operand*/, ShapeIndex /*index*/, int64 dimension, + int64 /*operand_index*/, HloInstruction* dynamic_size) { if (hlo->shape().dimensions(dimension) != hlo->operand(0)->shape().dimensions(dimension)) { return Unimplemented( @@ -1100,7 +961,7 @@ Status DynamicDimensionInferenceVisitor::HandleDynamicUpdateSlice( hlo->ToString()); } - parent_->SetDynamicSize(hlo, {}, dimension, dynamic_size, constraint); + parent_->SetDynamicSize(hlo, {}, dimension, dynamic_size); return Status::OK(); }); @@ -1108,16 +969,16 @@ Status DynamicDimensionInferenceVisitor::HandleDynamicUpdateSlice( Status DynamicDimensionInferenceVisitor::HandleReverse(HloInstruction* hlo) { return ForEachOperandDynamicDimension( - hlo, [&](HloInstruction* /*operand*/, ShapeIndex /*index*/, - int64 dimension, int64 /*operand_index*/, - HloInstruction* dynamic_size, DimensionConstraint constraint) { + hlo, + [&](HloInstruction* /*operand*/, ShapeIndex /*index*/, int64 dimension, + int64 /*operand_index*/, HloInstruction* dynamic_size) { if (absl::c_linear_search(hlo->dimensions(), dimension)) { return Unimplemented( "Dynamic dimension propagation on reversed dimension is not " "supported %s", hlo->ToString()); } - parent_->SetDynamicSize(hlo, {}, dimension, dynamic_size, constraint); + parent_->SetDynamicSize(hlo, {}, dimension, dynamic_size); return Status::OK(); }); @@ -1127,7 +988,7 @@ Status DynamicDimensionInferenceVisitor::HandleGather(HloInstruction* hlo) { return ForEachOperandDynamicDimension( hlo, [&](HloInstruction* operand, ShapeIndex /*index*/, int64 input_dynamic_dimension, int64 operand_index, - HloInstruction* dynamic_size, DimensionConstraint constraint) { + HloInstruction* dynamic_size) { const GatherDimensionNumbers& gather_dims = hlo->gather_dimension_numbers(); if (operand_index != 1) { @@ -1147,8 +1008,7 @@ Status DynamicDimensionInferenceVisitor::HandleGather(HloInstruction* hlo) { output_dimension--; } } - parent_->SetDynamicSize(hlo, {}, output_dimension, dynamic_size, - constraint); + parent_->SetDynamicSize(hlo, {}, output_dimension, dynamic_size); return Status::OK(); } return Unimplemented( @@ -1171,8 +1031,7 @@ Status DynamicDimensionInferenceVisitor::HandleGather(HloInstruction* hlo) { indices_dim++; } if (indices_dim++ == input_dynamic_dimension) { - parent_->SetDynamicSize(hlo, {}, output_dim, dynamic_size, - constraint); + parent_->SetDynamicSize(hlo, {}, output_dim, dynamic_size); return Status::OK(); } } @@ -1220,8 +1079,7 @@ Status DynamicDimensionInferenceVisitor::HandleConditional( TF_RETURN_IF_ERROR(ForEachDynamicDimensionInOperand( hlo, operand_index, [&](HloInstruction*, ShapeIndex, int64, int64, - HloInstruction* dynamic_size, - DimensionConstraint constraint) -> Status { + HloInstruction* dynamic_size) -> Status { TF_RET_CHECK(hlo->operand(operand_index)->shape().IsTuple()) << "Only tuple typed inputs can have dynamic 
dimension. Please " "file a bug against XLA team."; @@ -1263,8 +1121,7 @@ Status DynamicDimensionInferenceVisitor::HandleConditional( TF_RETURN_IF_ERROR(ForEachDynamicDimensionInOperand( hlo, operand_index, [&](HloInstruction*, ShapeIndex index, int64 dimension, - int64 operand_index, HloInstruction* dynamic_size, - DimensionConstraint constraint) { + int64 operand_index, HloInstruction* dynamic_size) { DynamicParameterBinding::DynamicParameter dynamic_parameter{ 0, {dynamic_size_to_operand_id_index_map[dynamic_size]}}; DynamicParameterBinding::DynamicDimension dynamic_dimension{ @@ -1284,8 +1141,8 @@ Status DynamicDimensionInferenceVisitor::HandleConditional( // that into the root instruction as additional tuple elements. TF_RETURN_IF_ERROR(ForEachDynamicDimension( new_computation->root_instruction(), - [&](ShapeIndex index, int64 dim, HloInstruction* dynamic_size, - DimensionConstraint) -> Status { + [&](ShapeIndex index, int64 dim, + HloInstruction* dynamic_size) -> Status { TF_RET_CHECK(hlo->shape().IsTuple()) << "Only tuple typed conditionals can have dynamic dimension. " "Please file a bug against XLA team."; @@ -1347,11 +1204,9 @@ Status DynamicDimensionInferenceVisitor::HandleScatter(HloInstruction* hlo) { return ForEachOperandDynamicDimension( hlo, [&](HloInstruction* /*operand*/, ShapeIndex /*index*/, int64 dimension, - int64 operand_index, HloInstruction* operand_dynamic_size, - DimensionConstraint constraint) { + int64 operand_index, HloInstruction* operand_dynamic_size) { if (operand_index == 0) { - parent_->SetDynamicSize(hlo, {}, dimension, operand_dynamic_size, - constraint); + parent_->SetDynamicSize(hlo, {}, dimension, operand_dynamic_size); return Status::OK(); } @@ -1385,7 +1240,7 @@ Status DynamicDimensionInferenceVisitor::HandleWhile(HloInstruction* hlo) { int64 operand_count = original_tuple_count; TF_RETURN_IF_ERROR(ForEachOperandDynamicDimension( hlo, [&](HloInstruction*, ShapeIndex index, int64 dim, int64, - HloInstruction* dynamic_size, DimensionConstraint constraint) { + HloInstruction* dynamic_size) { operands_to_add.push_back(dynamic_size); dynamic_output_mapping.mutable_element(index)->emplace(dim, operand_count++); @@ -1413,8 +1268,7 @@ Status DynamicDimensionInferenceVisitor::HandleWhile(HloInstruction* hlo) { TF_RETURN_IF_ERROR(ForEachOperandDynamicDimension( hlo, [&](HloInstruction*, ShapeIndex index, int64 dimension, - int64 operand_index, HloInstruction* dynamic_size, - DimensionConstraint constraint) -> Status { + int64 operand_index, HloInstruction* dynamic_size) -> Status { TF_RET_CHECK(!operands_to_add.empty()); const int64 output_dynamic_size_index = dynamic_output_mapping.element(index).at(dimension); @@ -1431,7 +1285,7 @@ Status DynamicDimensionInferenceVisitor::HandleWhile(HloInstruction* hlo) { ShapeUtil::MakeScalarShape(S32), hlo, output_dynamic_size_index)); parent_->SetDynamicSize(result.replacement_instr, index, dimension, - output_dynamic_size, constraint); + output_dynamic_size); return Status::OK(); })); // Set the replacement instruction as visited to avoid visiting it again. @@ -1465,8 +1319,7 @@ Status DynamicDimensionInferenceVisitor::HandleWhile(HloInstruction* hlo) { // Add dynamic dimension size as new parameters. 
TF_RETURN_IF_ERROR(ForEachDynamicDimension( hlo->while_body()->root_instruction(), - [&](ShapeIndex index, int64 dim, HloInstruction* dynamic_size, - DimensionConstraint) -> Status { + [&](ShapeIndex index, int64 dim, HloInstruction* dynamic_size) -> Status { const int64 output_index = dynamic_output_mapping.element(index).at(dim); new_root_operands[output_index] = dynamic_size; @@ -1503,8 +1356,7 @@ Status DynamicDimensionInferenceVisitor::HandleParameter(HloInstruction* hlo) { parent_->SetDynamicSize(target_parameter, dynamic_dimension.parameter_index, - dynamic_dimension.dimension, dynamic_size, - DimensionConstraint(1, 1)); + dynamic_dimension.dimension, dynamic_size); return Status::OK(); }); } @@ -1517,10 +1369,8 @@ Status DynamicDimensionInferenceVisitor::ForEachDynamicDimension( HloInstruction* dynamic_size = parent_->GetDynamicSize( dynamic_dimension.inst, dynamic_dimension.index, dynamic_dimension.dim); - CHECK_NE(parent_->constraint_mapping_.count(dynamic_dimension), 0); - TF_RETURN_IF_ERROR(fn(dynamic_dimension.index, dynamic_dimension.dim, - dynamic_size, - parent_->constraint_mapping_[dynamic_dimension])); + TF_RETURN_IF_ERROR( + fn(dynamic_dimension.index, dynamic_dimension.dim, dynamic_size)); } } return Status::OK(); @@ -1536,10 +1386,9 @@ Status DynamicDimensionInferenceVisitor::ForEachDynamicDimensionInOperand( HloInstruction* dynamic_size = parent_->GetDynamicSize( dynamic_dimension.inst, dynamic_dimension.index, dynamic_dimension.dim); - CHECK_NE(parent_->constraint_mapping_.count(dynamic_dimension), 0); TF_RETURN_IF_ERROR(fn(dynamic_dimension.inst, dynamic_dimension.index, - dynamic_dimension.dim, operand_index, dynamic_size, - parent_->constraint_mapping_[dynamic_dimension])); + dynamic_dimension.dim, operand_index, + dynamic_size)); } } return Status::OK(); @@ -1555,6 +1404,24 @@ Status DynamicDimensionInferenceVisitor::ForEachOperandDynamicDimension( return Status::OK(); } +void DynamicDimensionInference::SetDynamicSize(HloInstruction* inst, + const ShapeIndex& index, + int64 dim, + HloInstruction* size) { + VLOG(1) << "Set dimension inst " << inst->ToString() << " index " + << index.ToString() << "@" << dim << " to " << size->ToShortString(); + Shape subshape = ShapeUtil::GetSubshape(inst->shape(), index); + CHECK(!subshape.IsTuple()) << "Can't set a tuple shape to dynamic dimension"; + CHECK(dim < subshape.rank() && dim >= 0) + << "Asked to set invalid dynamic dimension. Shape: " + << subshape.ToString() << ", Dimension: " << dim; + DynamicDimension dynamic_dimension{inst, index, dim}; + // Updating a dynamic dimension twice overwrites the previous one. 
+ dynamic_mapping_[dynamic_dimension] = size; + auto iter = per_hlo_dynamic_dimensions_.try_emplace(inst); + iter.first->second.emplace(dynamic_dimension); +} + void DynamicDimensionInference::CopyMapping(HloInstruction* from, HloInstruction* to) { auto iter = per_hlo_dynamic_dimensions_.find(from); @@ -1564,7 +1431,7 @@ void DynamicDimensionInference::CopyMapping(HloInstruction* from, GetDynamicSize(dynamic_dimension.inst, dynamic_dimension.index, dynamic_dimension.dim); SetDynamicSize(to, dynamic_dimension.index, dynamic_dimension.dim, - dynamic_size, constraint_mapping_[dynamic_dimension]); + dynamic_size); } } } @@ -1624,8 +1491,6 @@ Status DynamicDimensionInference::ForwardDynamicSize(HloInstruction* inst, auto iter = dynamic_mapping_.find(dynamic_dimension); if (iter != dynamic_mapping_.end()) { dynamic_mapping_.insert({dynamic_dimension_new, iter->second}); - constraint_mapping_.insert( - {dynamic_dimension_new, constraint_mapping_[dynamic_dimension]}); auto iter = per_hlo_dynamic_dimensions_.try_emplace(new_inst); iter.first->second.emplace(dynamic_dimension_new); } diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference.h b/tensorflow/compiler/xla/service/dynamic_dimension_inference.h index 607d68bd9c3..1597538e9ac 100644 --- a/tensorflow/compiler/xla/service/dynamic_dimension_inference.h +++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference.h @@ -55,8 +55,7 @@ class DynamicDimensionInference { // go into tuples. bool HasDynamicDimension(HloInstruction* inst) const; - // Forward dynamic dimension size at `dim` and its constraint from `inst` to - // `new_inst`. + // Forward dynamic dimension size at `dim` from `inst` to `new_inst`. Status ForwardDynamicSize(HloInstruction* inst, HloInstruction* new_inst, const ShapeIndex& index); @@ -64,9 +63,7 @@ class DynamicDimensionInference { // `inst` at `index` has a dynamic size, and its runtime size is represented // by a scalar instruction `size`. void SetDynamicSize(HloInstruction* inst, const ShapeIndex& index, int64 dim, - HloInstruction* size) { - SetDynamicSize(inst, index, dim, size, DimensionConstraint(1, 1)); - } + HloInstruction* size); // For all tensors whose dynamic dimension is `replace`, replace them with // `with`. @@ -106,116 +103,6 @@ class DynamicDimensionInference { } }; - // DimensionConstraint is attached to each dynamic dimension and describe the - // constraint of each dimension. This is used to disambiguate the index of - // dynamic dimension for reshapes that "splits" a dimension into two. - // - // As an example, consider the following reshapes: - // [<=3, 3] <- Assume first dimension is dynamic. - // | - // Reshape.1 - // | - // [<=9] <- Dimension 9 is dynamic - // | - // Reshape.2 - // | - // [3, 3] <- Ambiguous dimension after splitting 9 into [3, 3] - // - // There is no way to know which dimension is dynamic by looking at the second - // reshape locally. 
- // - // However, if we look at the dynamic dimension 9, since it comes from - // collapsing a major dynamic dimension of 3 (the dynamic size can be 0, 1, 2, - // 3, denoted as i in the diagram below) and a minor static dimension of 3, we - // know it has certain constraints that the reshape can only be one of the 4 - // forms: - // - // o: Padded Data - // x: Effective Data - // - // [<=3, 3] to [9] - // - // +---+ +---+ +---+ +---+ - // |ooo| |ooo| |ooo| |xxx| - // |ooo| |ooo| |xxx| |xxx| - // |ooo| |xxx| |xxx| |xxx| - // +---+ +---+ +---+ +---+ - // - // Reshape Reshape Reshape Reshape - // - // +-----------+ +-----------+ +-----------+ +-----------+ - // |ooo|ooo|ooo| or |xxx|ooo|ooo| or |xxx|xxx|ooo| or |xxx|xxx|xxx| stride=1 - // +-----------+ +-----------+ +-----------+ +-----------+ - // i = 0 i = 1 i = 2 i = 3 - // - // On the other hand, if the minor dimension 3 is dynamic and major dimension - // is static, we will have the following form: - // - // [3, <=3] to [9] - // - // +---+ +---+ +---+ +---+ - // |ooo| |xoo| |xxo| |xxx| - // |ooo| |xoo| |xxo| |xxx| - // |ooo| |xoo| |xxo| |xxx| - // +---+ +---+ +---+ +---+ - // - // Reshape Reshape Reshape Reshape - // - // +-----------+ +-----------+ +-----------+ +-----------+ - // |ooo|ooo|ooo| or |xoo|xoo|xoo| or |xxo|xxo|xxo| or |xxo|xxo|xxo| stride=3 - // +-----------+ +-----------+ +-----------+ +-----------+ - // i = 0 i = 1 i = 2 i = 3 - // - // By encoding constraint as a stride of elements we can recover this - // information later when we reshape from [9] to [3, 3]. We know which form - // ([3, i] or [i,3]) we should reshape the [9] into. - // - // - struct DimensionConstraint { - explicit DimensionConstraint(int64 s, int64 m) - : stride(s), multiple_of(m) {} - DimensionConstraint() : stride(1), multiple_of(1) {} - // Stride represents the distance of a newly placed element and the previous - // placed element on this dynamic dimension. - int64 stride; - - // multiple_of represents the constraints that - // - // `dynamic_size` % `multiple_of` == 0 - int64 multiple_of; - }; - - using ConstraintMapping = - absl::flat_hash_map; - - ConstraintMapping constraint_mapping_; - - // Update the dynamic mapping so that we know dimension `dim` of instruction - // `inst` at `index` has a dynamic size, and its runtime size is represented - // by a scalar instruction `size`. - void SetDynamicSize(HloInstruction* inst, const ShapeIndex& index, int64 dim, - HloInstruction* size, DimensionConstraint constraint) { - VLOG(1) << "Set dimension inst " << inst->ToString() << " index " - << index.ToString() << "@" << dim << " to " << size->ToShortString() - << " constraint: " << constraint.multiple_of; - Shape subshape = ShapeUtil::GetSubshape(inst->shape(), index); - CHECK(!subshape.IsTuple()) - << "Can't set a tuple shape to dynamic dimension"; - CHECK(dim < subshape.rank() && dim >= 0) - << "Asked to set invalid dynamic dimension. Shape: " - << subshape.ToString() << ", Dimension: " << dim; - DynamicDimension dynamic_dimension{inst, index, dim}; - // Updating a dynamic dimension twice overwrites the previous one. - dynamic_mapping_[dynamic_dimension] = size; - if (constraint_mapping_.count(dynamic_dimension) != 0) { - CHECK_EQ(constraint_mapping_[dynamic_dimension].stride, - constraint.stride); - } - constraint_mapping_[dynamic_dimension] = constraint; - auto iter = per_hlo_dynamic_dimensions_.try_emplace(inst); - iter.first->second.emplace(dynamic_dimension); - } - // Copies the internal mapping from instruction `from` to instruction `to`. 
// This is useful when an instruction is replaced by the other during the // inferencing process. diff --git a/tensorflow/compiler/xla/service/dynamic_padder.cc b/tensorflow/compiler/xla/service/dynamic_padder.cc index 44fdda0f411..c1f9da599e8 100644 --- a/tensorflow/compiler/xla/service/dynamic_padder.cc +++ b/tensorflow/compiler/xla/service/dynamic_padder.cc @@ -688,9 +688,7 @@ StatusOr RewriteDynamicConcat( dynamic_size)); } } - for (HloInstruction* user : prev_users) { - TF_RETURN_IF_ERROR(concat->ReplaceUseWith(user, rewritten_concat)); - } + TF_RETURN_IF_ERROR(concat->ReplaceUsesWith(prev_users, rewritten_concat)); TF_RETURN_IF_ERROR(dynamic_dimension_inference->ForwardDynamicSize( concat, rewritten_concat, {})); return true; diff --git a/tensorflow/compiler/xla/service/dynamic_padder_test.cc b/tensorflow/compiler/xla/service/dynamic_padder_test.cc index e4c70317f2b..e8f429d9db6 100644 --- a/tensorflow/compiler/xla/service/dynamic_padder_test.cc +++ b/tensorflow/compiler/xla/service/dynamic_padder_test.cc @@ -83,8 +83,8 @@ class DynamicPadderTest : public HloTestBase { return module; } - StatusOr RunPadder() { - DynamicPadder padder(/*slice_dynamic_output=*/true, + StatusOr RunPadder(bool slice_dynamic_output = false) { + DynamicPadder padder(/*slice_dynamic_output=*/slice_dynamic_output, CustomCallDynamicDimensionInference, OpHasDynamismSupport); return padder.Run(module_.get()); @@ -162,7 +162,7 @@ ENTRY main { module_ = GetHloModule(hlo_text); - TF_ASSERT_OK(RunPadder().status()); + TF_ASSERT_OK(RunPadder(/*slice_dynamic_output=*/true).status()); // After rewrite, we should have : // // param @@ -218,7 +218,7 @@ ENTRY main { module_ = GetHloModule(hlo_text); - TF_ASSERT_OK(RunPadder().status()); + TF_ASSERT_OK(RunPadder(/*slice_dynamic_output=*/true).status()); // After rewrite, we should have : // // param @@ -654,26 +654,16 @@ XLA_TEST_F(ExecutionTest, DynamicConcat) { const string hlo_text = R"( HloModule DynamicConcat -update_s32 (lhs: s32[], rhs: s32[]) -> s32[] { - lhs = s32[] parameter(0) - rhs = s32[] parameter(1) - ROOT add = s32[] add(lhs, rhs) -} - ENTRY main { param_0 = s32[3] parameter(0) param_1 = s32[3] parameter(1) param_2 = s32[3] parameter(2) size = s32[] constant(2) - param_padded_0 = s32[3] set-dimension-size(param_0, size), dimensions={0} - param_padded_2 = s32[3] set-dimension-size(param_2, size), dimensions={0} - %concatenate = s32[9] - concatenate(s32[3] param_padded_0, s32[3] param_1, s32[3] param_padded_2), + param_padded_0 = s32[<=3] set-dimension-size(param_0, size), dimensions={0} + param_padded_2 = s32[<=3] set-dimension-size(param_2, size), dimensions={0} + ROOT %concatenate = s32[9] + concatenate(s32[<=3] param_padded_0, s32[<=3] param_1, s32[<=3] param_padded_2), dimensions={0} - init = s32[] constant(0) - ROOT reduce = s32[] reduce(concatenate, init), - dimensions={0}, - to_apply=update_s32 } )"; @@ -686,10 +676,10 @@ ENTRY main { LiteralUtil::CreateR1({6, 7, -1}); // Dynamic operand. 
auto module = GetHloModule(hlo_text); - Literal result = - PadAndExecute(std::move(module), {&operand_0, &operand_1, &operand_2}); - - Literal expected = LiteralUtil::CreateR0(28); + Literal result = PadAndExecute(std::move(module), + {&operand_0, &operand_1, &operand_2}, false); + result.SetDynamicSize(0, 7); + Literal expected = LiteralUtil::CreateR1({1, 2, 3, 4, 5, 6, 7}); EXPECT_EQ(result, expected); } diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc index 4b6c30cadc4..98d523487b4 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc @@ -2462,10 +2462,6 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator( operand_to_generator.at(hlo->operand(i))(index)); operands.push_back(operand_value); } - std::vector input_generators; - for (const HloInstruction* instr : hlo->operands()) { - input_generators.push_back(operand_to_generator.at(instr)); - } return EmitElementalMap(Cast(hlo), operands); }; case HloOpcode::kReduceWindow: diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc index 61ce6200a28..d5cf2ee9ac0 100644 --- a/tensorflow/compiler/xla/service/executable.cc +++ b/tensorflow/compiler/xla/service/executable.cc @@ -93,7 +93,8 @@ StatusOr Executable::ExecuteOnStream( static ExecutionInput MakeMaybeOwningDeviceMemoryTree( const ShapedBuffer& shaped_buffer) { - ExecutionInput result(shaped_buffer.on_device_shape()); + ExecutionInput result(shaped_buffer.on_device_shape(), + shaped_buffer.on_host_shape()); shaped_buffer.buffers().ForEachElement( [&](const ShapeIndex& index, const se::DeviceMemoryBase& mem) { result.SetBuffer(index, MaybeOwningDeviceMemory(mem)); @@ -105,10 +106,10 @@ StatusOr Executable::ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, absl::Span arguments, HloExecutionProfile* hlo_execution_profile) { - std::vector args(arguments.size()); - auto out_it = args.begin(); + std::vector args; + args.reserve(arguments.size()); for (const ShapedBuffer* arg : arguments) { - *out_it++ = MakeMaybeOwningDeviceMemoryTree(*arg); + args.emplace_back(MakeMaybeOwningDeviceMemoryTree(*arg)); } TF_ASSIGN_OR_RETURN(ExecutionOutput out, ExecuteAsyncOnStream(run_options, std::move(args), diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h index 6881f6dd68a..2e3ddedfb8c 100644 --- a/tensorflow/compiler/xla/service/executable.h +++ b/tensorflow/compiler/xla/service/executable.h @@ -60,10 +60,17 @@ namespace xla { // with their indices absent from unowned_indices_. class ExecutionInput { public: - ExecutionInput() = default; - explicit ExecutionInput(xla::Shape shape) : buffers_(std::move(shape)) {} - explicit ExecutionInput(ShapeTree buffers) - : buffers_(std::move(buffers)) {} + explicit ExecutionInput(xla::Shape shape, xla::Shape host_shape) + : buffers_(std::move(shape)) { + SetHostShape(std::move(host_shape)); + } + + explicit ExecutionInput(ShapeTree buffers, + xla::Shape host_shape) + : buffers_(std::move(buffers)) { + SetHostShape(std::move(host_shape)); + } + ExecutionInput(ExecutionInput&&) = default; ~ExecutionInput(); @@ -74,6 +81,10 @@ class ExecutionInput { return dynamic_shape_ != nullptr ? *dynamic_shape_ : buffers_.shape(); } + const Shape& host_shape() const { + return host_shape_ != nullptr ? 
*host_shape_ : shape(); + } + Status SetDynamicShape(Shape dynamic_shape); xla::StatusOr ToShapedBuffer( @@ -94,6 +105,8 @@ class ExecutionInput { unowned_indices_.erase(index); } + const std::set& unowned_indices() { return unowned_indices_; } + const ShapeTree& Buffers() const { return buffers_; } ShapeTree* MutableBuffers() { return &buffers_; } @@ -107,11 +120,18 @@ class ExecutionInput { } private: + void SetHostShape(xla::Shape host_shape) { + if (shape() != host_shape) { + host_shape_ = absl::make_unique(std::move(host_shape)); + } + } + ShapeTree buffers_; // Set of indices of buffers that should be returned to the caller if an error // occurs when enqueuing the computation. std::set unowned_indices_; std::unique_ptr dynamic_shape_; + std::unique_ptr host_shape_; }; // ExecutionOutput encapsulates the output buffers of a execution and the @@ -172,6 +192,12 @@ class ExecutionOutput { return std::move(to_be_released_); } + std::vector ConsumeAliasedIndices() { + auto aliased = std::move(aliased_indices_); + aliased_indices_.clear(); + return aliased; + } + private: ScopedShapedBuffer result_; diff --git a/tensorflow/compiler/xla/service/gather_expander.cc b/tensorflow/compiler/xla/service/gather_expander.cc index 1838f65e6ea..d38873a501d 100644 --- a/tensorflow/compiler/xla/service/gather_expander.cc +++ b/tensorflow/compiler/xla/service/gather_expander.cc @@ -269,6 +269,22 @@ static StatusOr PermuteBatchAndOffsetDims( return MakeTransposeHlo(accumulator, permutation); } +// Computes how many trips a loop implementing this gather op would take. +static int64 GatherLoopTripCount(HloInstruction* gather_instr) { + HloInstruction* start_indices = gather_instr->mutable_operand(1); + const Shape& start_indices_shape = start_indices->shape(); + const GatherDimensionNumbers& dim_numbers = + gather_instr->gather_dimension_numbers(); + + int64 trip_count = 1; + for (int64 i = 0, e = start_indices_shape.dimensions_size(); i < e; i++) { + if (i != dim_numbers.index_vector_dim()) { + trip_count *= start_indices_shape.dimensions(i); + } + } + return trip_count; +} + // High Level Algorithm // // We follow the following steps in sequence: @@ -311,20 +327,13 @@ StatusOr GatherExpander::ExpandInstruction( HloComputation* computation = gather_instr->parent(); HloInstruction* operand = gather_instr->mutable_operand(0); HloInstruction* start_indices = gather_instr->mutable_operand(1); - const Shape& start_indices_shape = start_indices->shape(); const Shape& output_shape = gather_instr->shape(); int64 output_rank = output_shape.dimensions_size(); const GatherDimensionNumbers& dim_numbers = gather_instr->gather_dimension_numbers(); - int64 gather_loop_trip_count = 1; - for (int64 i = 0, e = start_indices_shape.dimensions_size(); i < e; i++) { - if (i != dim_numbers.index_vector_dim()) { - gather_loop_trip_count *= start_indices_shape.dimensions(i); - } - } - + int64 gather_loop_trip_count = GatherLoopTripCount(gather_instr); if (!IsInt32(gather_loop_trip_count)) { return Unimplemented( "Gather operations with more than 2147483647 gather indices are not " @@ -373,7 +382,11 @@ bool GatherExpander::InstructionMatchesPattern(HloInstruction* inst) { return inst->opcode() == HloOpcode::kGather && // Avoid expanding gather ops that produce zero sized tensors, // instead punt these to ZeroSizedHloElimination. 
- !ShapeUtil::IsZeroElementArray(inst->shape()); + !ShapeUtil::IsZeroElementArray(inst->shape()) && + // In kEliminateSimpleGathers mode, we only simplify instructions + // which can be represented without a loop -- i.e. we only simplify + // gathers which have a trip count of 1. + (mode_ == kEliminateAllGathers || GatherLoopTripCount(inst) == 1); } } // namespace xla diff --git a/tensorflow/compiler/xla/service/gather_expander.h b/tensorflow/compiler/xla/service/gather_expander.h index 5625a37cb46..e665fcd713c 100644 --- a/tensorflow/compiler/xla/service/gather_expander.h +++ b/tensorflow/compiler/xla/service/gather_expander.h @@ -21,10 +21,30 @@ limitations under the License. namespace xla { // This pass rewrites gather operations into (roughly) while loops of dynamic -// slices. This lets backends that don't support gather directly to -// nevertheless have a minimum level of support. +// slices. +// +// This pass can be used two ways: +// +// - kEliminateAllGathers: For backends that don't support gather, this pass +// can convert every gather to a loop. +// +// - kEliminateSimpleGathers: For backends that *do* support gather, this pass +// can strength-reduce "simple" gathers -- specifically, gathers that can be +// represented without a loop -- to dynamic-slices. +// +// Note that even in kEliminateSimpleGathers mode, this pass may still expand a +// gather into a loop (with a trip-count of 1). It's up to other simplification +// passes to remove the loop. +// class GatherExpander : public OpExpanderPass { public: + enum Mode { + kEliminateAllGathers, + kEliminateSimpleGathers, + }; + + explicit GatherExpander(Mode m) : mode_(m) {} + absl::string_view name() const override { return "gather_expander"; } protected: @@ -32,6 +52,9 @@ class GatherExpander : public OpExpanderPass { StatusOr<HloInstruction*> ExpandInstruction( HloInstruction* gather_inst) override; + + private: + Mode mode_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/gather_expander_test.cc b/tensorflow/compiler/xla/service/gather_expander_test.cc index 706327091d9..4b0808e9aaf 100644 --- a/tensorflow/compiler/xla/service/gather_expander_test.cc +++ b/tensorflow/compiler/xla/service/gather_expander_test.cc @@ -15,6 +15,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/gather_expander.h" +#include "tensorflow/compiler/xla/service/hlo_query.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/tests/test_macros.h" @@ -42,7 +43,9 @@ ENTRY main { TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, ParseAndReturnVerifiedModule(hlo_text)); - Status status = GatherExpander{}.Run(module.get()).status(); + Status status = GatherExpander{GatherExpander::kEliminateAllGathers} + .Run(module.get()) + .status(); EXPECT_EQ(status.code(), tensorflow::error::UNIMPLEMENTED); ASSERT_THAT( @@ -68,7 +71,9 @@ ENTRY main { )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, ParseAndReturnVerifiedModule(hlo_text)); - TF_ASSERT_OK_AND_ASSIGN(bool changed, GatherExpander{}.Run(module.get())); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, + GatherExpander{GatherExpander::kEliminateAllGathers}.Run(module.get())); ASSERT_TRUE(changed); HloInstruction* while_instr = nullptr; @@ -129,7 +134,9 @@ ENTRY main { OpMetadata metadata; metadata.set_op_name("Gather"); module->entry_computation()->root_instruction()->set_metadata(metadata); - TF_ASSERT_OK_AND_ASSIGN(bool changed, GatherExpander{}.Run(module.get())); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, + GatherExpander{GatherExpander::kEliminateAllGathers}.Run(module.get())); ASSERT_TRUE(changed); HloInstruction* while_instr = nullptr; @@ -147,5 +154,54 @@ ENTRY main { "after gather expansion"; EXPECT_EQ(while_instr->metadata().op_name(), "Gather"); } + +TEST_F(GatherExpanderTest, EliminateSimpleGathersSkipsNontrivialGather) { + const string hlo_text = R"( +HloModule TensorFlowGatherV1 + +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2] parameter(1) + ROOT gather = s32[2,3] gather(operand, indices), + offset_dims={1}, + collapsed_slice_dims={0}, + start_index_map={0}, + index_vector_dim=1, + slice_sizes={1, 3} +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(hlo_text)); + GatherExpander pass(GatherExpander::kEliminateSimpleGathers); + TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHloPass(&pass, module.get())); + ASSERT_FALSE(changed); +} + +TEST_F(GatherExpanderTest, EliminateSimpleGathersRewritesTrivialGather) { + const string hlo_text = R"( +HloModule test + +ENTRY main { + operand = s32[100] parameter(0) + indices = s32[1] parameter(1) + ROOT gather = s32[10] gather(operand, indices), + offset_dims={0}, + collapsed_slice_dims={}, + start_index_map={0}, + index_vector_dim=0, + slice_sizes={10} +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(hlo_text)); + GatherExpander pass(GatherExpander::kEliminateAllGathers); + TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHloPass(&pass, module.get())); + ASSERT_TRUE(changed); + ASSERT_FALSE(hlo_query::ContainsInstrWithOpcode(module->entry_computation(), + {HloOpcode::kGather})); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index b22f258bac6..074fbd92b27 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -27,6 +27,7 @@ load( "if_cuda_is_configured", ) load("//tensorflow:tensorflow.bzl", "if_nccl") +load("//third_party/mlir:tblgen.bzl", "gentbl") package( default_visibility = [":friends"], @@ -170,7 +171,6 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/service:buffer_assignment", "//tensorflow/compiler/xla/service:hlo", 
- "//tensorflow/compiler/xla/service/llvm_ir:alias_analysis", "//tensorflow/compiler/xla/service/llvm_ir:buffer_assignment_util", "//tensorflow/compiler/xla/service/llvm_ir:ir_array", "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", @@ -286,6 +286,7 @@ cc_library( "@com_google_absl//absl/container:inlined_vector", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/types:optional", "@com_google_absl//absl/types:span", "@llvm-project//llvm:Core", @@ -687,7 +688,7 @@ cc_library( ":gpu_autotuning_proto_cc", ":gpu_conv_runner", ":gpu_executable", - ":hlo_algorithm_blacklist", + ":hlo_algorithm_denylist", ":ir_emission_utils", ":stream_executor_util", "@com_google_absl//absl/algorithm:container", @@ -1168,6 +1169,8 @@ cc_library( "//tensorflow/compiler/xla/service:batchnorm_expander", "//tensorflow/compiler/xla/service:buffer_assignment", "//tensorflow/compiler/xla/service:call_inliner", + "//tensorflow/compiler/xla/service:comparison_expander", + "//tensorflow/compiler/xla/service:conditional_canonicalizer", "//tensorflow/compiler/xla/service:conditional_simplifier", "//tensorflow/compiler/xla/service:convolution_4d_expander", "//tensorflow/compiler/xla/service:dot_decomposer", @@ -1176,6 +1179,7 @@ cc_library( "//tensorflow/compiler/xla/service:dynamic_padder", "//tensorflow/compiler/xla/service:executable", "//tensorflow/compiler/xla/service:flatten_call_graph", + "//tensorflow/compiler/xla/service:gather_expander", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_constant_folding", "//tensorflow/compiler/xla/service:hlo_cse", @@ -1660,9 +1664,9 @@ tf_proto_library_cc( ) cc_library( - name = "hlo_algorithm_blacklist", - srcs = ["hlo_algorithm_blacklist.cc"], - hdrs = ["hlo_algorithm_blacklist.h"], + name = "hlo_algorithm_denylist", + srcs = ["hlo_algorithm_denylist.cc"], + hdrs = ["hlo_algorithm_denylist.h"], deps = [ ":gpu_autotuning_proto_cc", "//tensorflow/compiler/xla:debug_options_flags", @@ -1673,12 +1677,12 @@ cc_library( ) tf_cc_test( - name = "hlo_algorithm_blacklist_test", - srcs = ["hlo_algorithm_blacklist_test.cc"], - data = ["data/hlo_algorithm_blacklist.pbtxt"], + name = "hlo_algorithm_denylist_test", + srcs = ["hlo_algorithm_denylist_test.cc"], + data = ["data/hlo_algorithm_denylist.pbtxt"], tags = ["no_pip"], deps = [ - ":hlo_algorithm_blacklist", + ":hlo_algorithm_denylist", "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", @@ -1875,3 +1879,49 @@ cc_library( "@com_google_absl//absl/types:span", ], ) + +gentbl( + name = "xla_thunks_ops_inc_gen", + tbl_outs = [ + ("-gen-op-decls", "ir/xla_thunks_ops.h.inc"), + ("-gen-op-defs", "ir/xla_thunks_ops.cc.inc"), + ("-gen-struct-attr-decls", "ir/xla_thunks_structs.h.inc"), + ("-gen-struct-attr-defs", "ir/xla_thunks_structs.cc.inc"), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "ir/xla_thunks_ops.td", + td_srcs = [ + "@llvm-project//mlir:LLVMOpsTdFiles", + ], +) + +cc_library( + name = "xla_thunks_ops", + srcs = [ + "ir/xla_thunks_ops.cc", + "ir/xla_thunks_ops.cc.inc", + "ir/xla_thunks_ops.h.inc", + ], + hdrs = [ + "ir/xla_thunks_ops.h", + ], + deps = [ + ":xla_thunks_ops_inc_gen", + "//tensorflow/compiler/mlir/hlo", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:LLVMDialect", + ], +) + +# Library with XLA thunks dialect static initialization. 
+cc_library( + name = "xla_thunks_dialect_registration", + srcs = [ + "ir/dialect_registration.cc", + ], + deps = [ + ":xla_thunks_ops", + "@llvm-project//mlir:IR", + ], + alwayslink = 1, +) diff --git a/tensorflow/compiler/xla/service/gpu/collective_permute_thunk.cc b/tensorflow/compiler/xla/service/gpu/collective_permute_thunk.cc index bb76bf02eba..b3b5cf7e048 100644 --- a/tensorflow/compiler/xla/service/gpu/collective_permute_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/collective_permute_thunk.cc @@ -220,10 +220,13 @@ RefcountingHashMap& GlobalRendezvousMap() { CollectivePermuteThunk::CollectivePermuteThunk( ThunkInfo thunk_info, const BufferAllocation::Slice& src, const BufferAllocation::Slice& dest) - : Thunk(kCollectivePermute, thunk_info), src_(src), dest_(dest) {} + : Thunk(kCollectivePermute, thunk_info), + hlo_instruction_(thunk_info.hlo_instruction), + src_(src), + dest_(dest) {} Status CollectivePermuteThunk::ExecuteOnStream(const ExecuteParams& params) { - auto* instr = Cast(hlo_instruction()); + auto* instr = Cast(hlo_instruction_); auto op_profiler = params.profiler->MakeScopedInstructionProfiler(profile_index()); diff --git a/tensorflow/compiler/xla/service/gpu/collective_permute_thunk.h b/tensorflow/compiler/xla/service/gpu/collective_permute_thunk.h index 329db00c66a..44cc6a1c64e 100644 --- a/tensorflow/compiler/xla/service/gpu/collective_permute_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/collective_permute_thunk.h @@ -33,6 +33,7 @@ class CollectivePermuteThunk : public Thunk { Status ExecuteOnStream(const ExecuteParams& params) override; private: + const HloInstruction* hlo_instruction_; BufferAllocation::Slice src_; BufferAllocation::Slice dest_; }; diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc index 041aa9b6fa3..4cff48a89da 100644 --- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc @@ -29,6 +29,7 @@ ConditionalThunk::ConditionalThunk( absl::Span branch_operand_buffer_indexes, std::vector branch_thunk_sequences) : Thunk(Kind::kConditional, thunk_info), + hlo_instruction_(thunk_info.hlo_instruction), branch_index_is_bool_( thunk_info.hlo_instruction->operand(0)->shape().element_type() == PRED), @@ -45,13 +46,6 @@ ConditionalThunk::ConditionalThunk( } } -void ConditionalThunk::ComputeAnnotations() { - Thunk::ComputeAnnotations(); - for (auto& branch_thunk : branch_thunks_) { - branch_thunk->ComputeAnnotations(); - } -} - Status ConditionalThunk::Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) { if (branch_index_is_bool_) { @@ -91,8 +85,8 @@ Status ConditionalThunk::ExecuteOnStream(const ExecuteParams& params) { branch_index = pred ? 0 : 1; } else { // Handle default scenario for branch_index not in [0, num_branches). 
- if (branch_index < 0 || branch_index >= hlo_instruction()->branch_count()) { - branch_index = hlo_instruction()->branch_count() - 1; + if (branch_index < 0 || branch_index >= hlo_instruction_->branch_count()) { + branch_index = hlo_instruction_->branch_count() - 1; } } @@ -100,7 +94,7 @@ Status ConditionalThunk::ExecuteOnStream(const ExecuteParams& params) { profiler.StartHloComputation(); TF_RETURN_IF_ERROR(branch_thunks_[branch_index]->ExecuteOnStream(params)); profiler.FinishHloComputation( - hlo_instruction()->branch_computation(branch_index)); + hlo_instruction_->branch_computation(branch_index)); return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h index a00285efa7c..f91f1c52146 100644 --- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h @@ -51,12 +51,12 @@ class ConditionalThunk : public Thunk { ConditionalThunk(const ConditionalThunk&) = delete; ConditionalThunk& operator=(const ConditionalThunk&) = delete; - void ComputeAnnotations() override; Status Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) override; Status ExecuteOnStream(const ExecuteParams& params) override; private: + const HloInstruction* hlo_instruction_; const bool branch_index_is_bool_; BufferAllocation::Slice branch_index_buffer_index_; std::vector branch_operand_buffer_indexes_; diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc index df3dd6d4593..3048db95c39 100644 --- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc @@ -35,12 +35,11 @@ ConvolutionThunk::ConvolutionThunk( BufferAllocation::Slice result_slice, BufferAllocation::Slice scratch_slice, BufferAllocation::Slice tuple_result_slice) : Thunk(Kind::kConvolution, thunk_info), + cudnn_call_(Cast(thunk_info.hlo_instruction)), operand_buffers_(std::move(operand_slices)), result_buffer_(result_slice), scratch_buffer_(scratch_slice), - tuple_result_buffer_(tuple_result_slice) { - cudnn_call_ = Cast(hlo_instruction()); -} + tuple_result_buffer_(tuple_result_slice) {} Status ConvolutionThunk::ExecuteOnStream(const ExecuteParams& params) { const auto& buffer_allocations = *params.buffer_allocations; diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc index 36f415d9d89..e91b2c4d0d2 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc @@ -98,6 +98,7 @@ CudnnBatchNormForwardInferenceThunk::CudnnBatchNormForwardInferenceThunk( const BufferAllocation::Slice& variance, float epsilon, int64 feature_index, const BufferAllocation::Slice& output) : Thunk(Thunk::Kind::kCudnnBatchNormForwardInference, thunk_info), + hlo_instruction_(thunk_info.hlo_instruction), operand_(operand), scale_(scale), offset_(offset), @@ -106,7 +107,7 @@ CudnnBatchNormForwardInferenceThunk::CudnnBatchNormForwardInferenceThunk( epsilon_(epsilon), feature_index_(feature_index), output_(output) { - const auto* hlo = hlo_instruction(); + const auto* hlo = hlo_instruction_; CHECK_EQ(hlo->opcode(), HloOpcode::kCustomCall); CHECK_EQ(hlo->custom_call_target(), kCudnnBatchNormForwardInferenceCallTarget); @@ -130,7 +131,7 @@ Status CudnnBatchNormForwardInferenceThunk::ExecuteOnStream( 
buffer_allocations.GetDeviceAddress(variance_)); auto& stream = *params.stream; TF_RETURN_IF_ERROR(RunCudnnBatchNormForwardInference( - hlo_instruction(), operand, output_base, scale, offset, mean, variance, + hlo_instruction_, operand, output_base, scale, offset, mean, variance, epsilon_, feature_index_, &stream)); if (!stream.ok()) { @@ -148,6 +149,7 @@ CudnnBatchNormForwardTrainingThunk::CudnnBatchNormForwardTrainingThunk( const BufferAllocation::Slice& output_inv_stddev, const BufferAllocation::Slice& output_tuple) : Thunk(Thunk::Kind::kCudnnBatchNormForwardTraining, thunk_info), + hlo_instruction_(thunk_info.hlo_instruction), operand_(operand), scale_(scale), offset_(offset), @@ -157,7 +159,7 @@ CudnnBatchNormForwardTrainingThunk::CudnnBatchNormForwardTrainingThunk( output_mean_(output_mean), output_inv_stddev_(output_inv_stddev), output_tuple_(output_tuple) { - const auto* hlo = hlo_instruction(); + const auto* hlo = hlo_instruction_; CHECK_EQ(hlo->opcode(), HloOpcode::kCustomCall); CHECK_EQ(hlo->custom_call_target(), kCudnnBatchNormForwardTrainingCallTarget); CHECK_EQ(hlo->shape().tuple_shapes_size(), 3); @@ -183,7 +185,7 @@ Status CudnnBatchNormForwardTrainingThunk::ExecuteOnStream( params.profiler->MakeScopedInstructionProfiler(profile_index()); auto& stream = *params.stream; TF_RETURN_IF_ERROR(RunCudnnBatchNormForwardTraining( - hlo_instruction(), operand, output_data, output_mean, output_inv_stddev, + hlo_instruction_, operand, output_data, output_mean, output_inv_stddev, se::DeviceMemory(buffer_allocations.GetDeviceAddress(scale_)), se::DeviceMemory(buffer_allocations.GetDeviceAddress(offset_)), epsilon_, feature_index_, &stream)); @@ -214,6 +216,7 @@ CudnnBatchNormBackwardThunk::CudnnBatchNormBackwardThunk( const BufferAllocation::Slice& output_grad_offset, const BufferAllocation::Slice& output_tuple) : Thunk(Thunk::Kind::kCudnnBatchNormBackward, thunk_info), + hlo_instruction_(thunk_info.hlo_instruction), operand_(operand), scale_(scale), mean_(mean), @@ -225,7 +228,7 @@ CudnnBatchNormBackwardThunk::CudnnBatchNormBackwardThunk( output_grad_scale_(output_grad_scale), output_grad_offset_(output_grad_offset), output_tuple_(output_tuple) { - const auto* hlo = hlo_instruction(); + const auto* hlo = hlo_instruction_; CHECK_EQ(hlo->opcode(), HloOpcode::kCustomCall); CHECK_EQ(hlo->custom_call_target(), kCudnnBatchNormBackwardCallTarget); CHECK_EQ(hlo->shape().tuple_shapes_size(), 3); @@ -253,7 +256,7 @@ Status CudnnBatchNormBackwardThunk::ExecuteOnStream( params.profiler->MakeScopedInstructionProfiler(profile_index()); se::Stream* stream = params.stream; TF_RETURN_IF_ERROR(RunCudnnBatchNormBackward( - hlo_instruction(), operand, output_grad_data, grad_output, + hlo_instruction_, operand, output_grad_data, grad_output, output_grad_scale, output_grad_offset, se::DeviceMemory(buffer_allocations.GetDeviceAddress(scale_)), se::DeviceMemory(buffer_allocations.GetDeviceAddress(mean_)), diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h index 5897435a58f..bb46017b8fb 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h @@ -63,6 +63,7 @@ class CudnnBatchNormForwardInferenceThunk : public Thunk { Status ExecuteOnStream(const ExecuteParams& params) override; private: + const HloInstruction* hlo_instruction_; BufferAllocation::Slice operand_; BufferAllocation::Slice scale_; BufferAllocation::Slice offset_; @@ -92,6 +93,7 @@ 
class CudnnBatchNormForwardTrainingThunk : public Thunk { Status ExecuteOnStream(const ExecuteParams& params) override; private: + const HloInstruction* hlo_instruction_; BufferAllocation::Slice operand_; BufferAllocation::Slice scale_; BufferAllocation::Slice offset_; @@ -124,6 +126,7 @@ class CudnnBatchNormBackwardThunk : public Thunk { Status ExecuteOnStream(const ExecuteParams& params) override; private: + const HloInstruction* hlo_instruction_; BufferAllocation::Slice operand_; BufferAllocation::Slice scale_; BufferAllocation::Slice mean_; diff --git a/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc b/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc index 16a1f923c91..dae15659402 100644 --- a/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc @@ -26,11 +26,12 @@ CustomCallThunk::CustomCallThunk( std::vector> operand_slices, ShapeTree result_slices, std::string opaque) : Thunk(Thunk::kCustomCall, thunk_info), + hlo_instruction_(thunk_info.hlo_instruction), call_target_(call_target), operand_slices_(std::move(operand_slices)), result_slices_(std::move(result_slices)), opaque_(std::move(opaque)) { - const HloInstruction* instr = hlo_instruction(); + const HloInstruction* instr = hlo_instruction_; CHECK_EQ(instr->operand_count(), operand_slices_.size()); for (int64 i = 0; i < instr->operand_count(); ++i) { const auto& s1 = operand_slices_[i].shape(); diff --git a/tensorflow/compiler/xla/service/gpu/custom_call_thunk.h b/tensorflow/compiler/xla/service/gpu/custom_call_thunk.h index 72175daf3dd..31c03f5252f 100644 --- a/tensorflow/compiler/xla/service/gpu/custom_call_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/custom_call_thunk.h @@ -46,6 +46,7 @@ class CustomCallThunk : public Thunk { Status ExecuteOnStream(const ExecuteParams& params) override; private: + const HloInstruction* hlo_instruction_; void* call_target_; std::vector> operand_slices_; ShapeTree result_slices_; diff --git a/tensorflow/compiler/xla/service/gpu/data/hlo_algorithm_blacklist.pbtxt b/tensorflow/compiler/xla/service/gpu/data/hlo_algorithm_denylist.pbtxt similarity index 100% rename from tensorflow/compiler/xla/service/gpu/data/hlo_algorithm_blacklist.pbtxt rename to tensorflow/compiler/xla/service/gpu/data/hlo_algorithm_denylist.pbtxt diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc index eee0fc83481..3f000a2491d 100644 --- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc @@ -289,7 +289,7 @@ StatusOr GpuElementalIrEmitter::EmitTanh(PrimitiveType prim_type, auto one_with_sign = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::copysign, {one, input}, {type}, b_); return FPCast(Select(FCmpULT(abs_value, max_value), fast_tanh, one_with_sign), - value->getType()); + value->getType(), "tanh"); } StatusOr GpuElementalIrEmitter::EmitComplexAbs( diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.cc b/tensorflow/compiler/xla/service/gpu/for_thunk.cc index 7fc3bdd4436..ccd661d8ade 100644 --- a/tensorflow/compiler/xla/service/gpu/for_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/for_thunk.cc @@ -26,6 +26,7 @@ namespace gpu { ForThunk::ForThunk(ThunkInfo thunk_info, const int64 loop_limit, std::unique_ptr body_thunk_sequence) : Thunk(Kind::kWhile, thunk_info), + hlo_instruction_(thunk_info.hlo_instruction), loop_limit_(loop_limit), body_thunk_sequence_(absl::make_unique( 
// Pass nullptr as the HloInstruction* to the body_thunk_sequence_ @@ -33,11 +34,6 @@ ForThunk::ForThunk(ThunkInfo thunk_info, const int64 loop_limit, // this ForThunk, and shouldn't be profiled separately from it. ThunkInfo(), std::move(*body_thunk_sequence))) {} -void ForThunk::ComputeAnnotations() { - Thunk::ComputeAnnotations(); - body_thunk_sequence_->ComputeAnnotations(); -} - Status ForThunk::Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) { TF_RETURN_IF_ERROR(body_thunk_sequence_->Initialize(executable, executor)); @@ -46,14 +42,14 @@ Status ForThunk::Initialize(const GpuExecutable& executable, Status ForThunk::ExecuteOnStream(const ExecuteParams& params) { VLOG(2) << "Executing ForThunk with " << loop_limit_ << " iters for " - << (hlo_instruction() ? hlo_instruction()->ToString() : ""); + << (hlo_instruction_ ? hlo_instruction_->ToString() : ""); auto op_profiler = params.profiler->MakeScopedInstructionProfiler(profile_index()); for (int64 i = 0; i < loop_limit_; ++i) { params.profiler->StartHloComputation(); // Invoke loop body thunk sequence. TF_RETURN_IF_ERROR(body_thunk_sequence_->ExecuteOnStream(params)); - params.profiler->FinishHloComputation(hlo_instruction()->while_body()); + params.profiler->FinishHloComputation(hlo_instruction_->while_body()); } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.h b/tensorflow/compiler/xla/service/gpu/for_thunk.h index 77a89ea6023..b6ee950737e 100644 --- a/tensorflow/compiler/xla/service/gpu/for_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/for_thunk.h @@ -36,12 +36,12 @@ class ForThunk : public Thunk { ForThunk(const ForThunk&) = delete; ForThunk& operator=(const ForThunk&) = delete; - void ComputeAnnotations() override; Status Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) override; Status ExecuteOnStream(const ExecuteParams& params) override; private: + const HloInstruction* hlo_instruction_; const int64 loop_limit_; std::unique_ptr body_thunk_sequence_; }; diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc index 561dfbe3137..e55df0bb230 100644 --- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc @@ -40,6 +40,7 @@ GemmThunk::GemmThunk(ThunkInfo thunk_info, bool implements_whole_instruction, const GemmBackendConfig &backend_config) : Thunk(Kind::kGemm, thunk_info), + hlo_instruction_(thunk_info.hlo_instruction), lhs_buffer_(lhs_buffer), rhs_buffer_(rhs_buffer), output_buffer_(output_buffer), @@ -51,11 +52,11 @@ Status GemmThunk::ExecuteOnStream(const ExecuteParams ¶ms) { return params.buffer_allocations->GetDeviceAddress(slice); }; - VLOG(3) << "Running GEMM thunk on instruction: " << hlo_instruction(); + VLOG(3) << "Running GEMM thunk on instruction: " << hlo_instruction_; se::DeviceMemoryBase lhs_data = get_device_address(lhs_buffer_); se::DeviceMemoryBase rhs_data = get_device_address(rhs_buffer_); se::DeviceMemoryBase output_data = get_device_address(output_buffer_); - return RunGemm(hlo_instruction(), backend_config_, lhs_data, rhs_data, + return RunGemm(hlo_instruction_, backend_config_, lhs_data, rhs_data, output_data, params.stream, implements_whole_instruction_, profile_index(), params.profiler); } @@ -82,24 +83,28 @@ static bool DoGemmWithAlgorithm( // Converts from an XLA PrimitiveType to a blas::ComputationType, which is // used to specify the precision with which matmul computations should be // performed, 
separately from the precision of the inputs and result. - se::blas::ComputationType computation_type = [&](PrimitiveType type) { - switch (type) { - case F16: - // Use F32 as computation type for F16 as we currently only implement - // the cuDNN pseudo half configuration for half precision. - return se::blas::ComputationType::kF32; - case F32: - return se::blas::ComputationType::kF32; - case F64: - return se::blas::ComputationType::kF64; - case C64: - return se::blas::ComputationType::kComplexF32; - case C128: - return se::blas::ComputationType::kComplexF64; - default: - LOG(FATAL) << "Unsupported type."; - } - }(type); + se::blas::ComputationType computation_type; + switch (type) { + case F16: + // Use F32 as computation type for F16 as we currently only implement + // the cuDNN pseudo half configuration for half precision. + computation_type = se::blas::ComputationType::kF32; + break; + case F32: + computation_type = se::blas::ComputationType::kF32; + break; + case F64: + computation_type = se::blas::ComputationType::kF64; + break; + case C64: + computation_type = se::blas::ComputationType::kComplexF32; + break; + case C128: + computation_type = se::blas::ComputationType::kComplexF64; + break; + default: + return false; + } se::DeviceMemory lhs_data(lhs_matrix.data); se::DeviceMemory rhs_data(rhs_matrix.data); @@ -296,7 +301,7 @@ Status RunGemm(const HloInstruction *gemm, stream, best_algorithm, /*output_profile_result=*/profile_result); default: - LOG(FATAL) << "Unsupported type."; + return false; } }(); diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h index 2bccb7b3572..1a51a7d4e0c 100644 --- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h @@ -51,6 +51,7 @@ class GemmThunk : public Thunk { Status ExecuteOnStream(const ExecuteParams& params) override; private: + const HloInstruction* hlo_instruction_; const BufferAllocation::Slice lhs_buffer_; const BufferAllocation::Slice rhs_buffer_; const BufferAllocation::Slice output_buffer_; diff --git a/tensorflow/compiler/xla/service/gpu/gpu_autotuning.proto b/tensorflow/compiler/xla/service/gpu/gpu_autotuning.proto index 35b5cfacb2d..563245da969 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_autotuning.proto +++ b/tensorflow/compiler/xla/service/gpu/gpu_autotuning.proto @@ -15,19 +15,19 @@ message ConvInstructionLog { repeated uint64 operand_addresses = 4; } -message BlacklistedAlgorithm { +message DenylistedAlgorithm { int64 id = 1; bool tensor_ops = 2; } -message AlgorithmBlacklistEntry { +message AlgorithmDenylistEntry { string hlo = 1; tensorflow.ComputeCapability cc = 2; tensorflow.CudnnVersion cudnn_version = 3; string blas_version = 5; - repeated BlacklistedAlgorithm algos = 4; + repeated DenylistedAlgorithm algos = 4; } -message AlgorithmBlacklist { - repeated AlgorithmBlacklistEntry entries = 1; +message AlgorithmDenylist { + repeated AlgorithmDenylistEntry entries = 1; } diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc index 3dcdb4c90eb..f5bf7476059 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc @@ -35,6 +35,8 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/batchnorm_expander.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/call_inliner.h" +#include "tensorflow/compiler/xla/service/comparison_expander.h" +#include "tensorflow/compiler/xla/service/conditional_canonicalizer.h" #include "tensorflow/compiler/xla/service/conditional_simplifier.h" #include "tensorflow/compiler/xla/service/convolution_4d_expander.h" #include "tensorflow/compiler/xla/service/dot_decomposer.h" @@ -42,6 +44,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/dynamic_index_splitter.h" #include "tensorflow/compiler/xla/service/dynamic_padder.h" #include "tensorflow/compiler/xla/service/flatten_call_graph.h" +#include "tensorflow/compiler/xla/service/gather_expander.h" #include "tensorflow/compiler/xla/service/gpu/alias_passthrough_params.h" #include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.h" #include "tensorflow/compiler/xla/service/gpu/fusion_merger.h" @@ -138,6 +141,9 @@ Status GpuCompiler::OptimizeHloModule( pipeline.AddPass(); pipeline.AddPass(RandomAlgorithm::RNG_PHILOX); + // Comparison total order expander + pipeline.AddPass(); + // Remove zero-sized HLO from the input so that other passes don't have to // handle it. pipeline.AddPass(); @@ -179,7 +185,7 @@ Status GpuCompiler::OptimizeHloModule( pipeline.AddPass( /*expansion_type=*/LogisticExpansionType::kExp); - + pipeline.AddPass(); pipeline.AddPass(); { @@ -189,11 +195,13 @@ Status GpuCompiler::OptimizeHloModule( /*layout_sensitive=*/false, /*allow_mixed_precision=*/false); - pipeline.AddPass(); + pass.AddPass(); // BatchNormExpander can create zero-sized ops, so zero-sized HLO // elimination has to come after that pass. - pipeline.AddPass(); + pass.AddPass(); + + pass.AddPass(GatherExpander::kEliminateSimpleGathers); AlgebraicSimplifierOptions options; // When transposes appear in a fusion node, we can easily adjust the @@ -537,10 +545,10 @@ static Status CompileModuleToLlvmIrImpl( // computation. // * For each visit of these HloInstructions, either none or one Thunk // will be returned. - // * If there is a thunk returned, thunk->hlo_instruction() equals the + // * If there is a thunk returned, thunk->hlo_instruction_ equals the // input HloInstruction*. // * A returned thunk may contain other sub-thunks. A sub-thunk may or may - // not have an associated hlo_instruction(). + // not have an associated hlo_instruction_. TF_RET_CHECK(thunks->size() <= 1) << instruction->ToString(); if (!thunks->empty()) { auto thunk = std::move(thunks->front()); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_conv_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/gpu_conv_algorithm_picker.cc index 67255f02665..8fb741323f3 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_conv_algorithm_picker.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_conv_algorithm_picker.cc @@ -24,7 +24,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h" #include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h" #include "tensorflow/compiler/xla/service/gpu/gpu_autotuning.pb.h" -#include "tensorflow/compiler/xla/service/gpu/hlo_algorithm_blacklist.h" +#include "tensorflow/compiler/xla/service/gpu/hlo_algorithm_denylist.h" #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h" #include "tensorflow/compiler/xla/service/hlo_casting_utils.h" @@ -438,10 +438,9 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda( (void)blas->GetVersion(&blas_version); } - absl::Span blacklisted_algos = - GetBlacklistedConvAlgorithms(GetComputeCapability(stream_exec_), - GetCudnnVersion(stream_exec_), blas_version, - canonical_hlo); + absl::Span disabled_algos = GetDisabledConvAlgorithms( + GetComputeCapability(stream_exec_), GetCudnnVersion(stream_exec_), + blas_version, canonical_hlo); for (const AlgorithmDesc& alg : GetAlgorithms(kind, stream_exec_)) { XLA_SCOPED_LOGGING_TIMER_LEVEL( @@ -449,7 +448,7 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda( AlgorithmToString(alg)), 2); - if (absl::c_linear_search(blacklisted_algos, alg)) { + if (absl::c_linear_search(disabled_algos, alg)) { LOG(INFO) << "Omitted potentially buggy algorithm " << AlgorithmToString(alg) << " for conv " << instr->ToString(); continue; @@ -503,7 +502,7 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda( if (!input_output_allocator_redzone_clear || !scratch_allocator_redzone_clear) { - AlgorithmBlacklist proto; + AlgorithmDenylist proto; auto entry = proto.add_entries(); entry->set_hlo(canonical_hlo); *entry->mutable_cc() = GetComputeCapability(stream_exec_); @@ -513,13 +512,12 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda( algo->set_id(alg.algo_id()); algo->set_tensor_ops(alg.tensor_ops_enabled()); - LOG(ERROR) - << "To blacklist this algorithm for this convolution, " - "copy-paste the following " - "proto to the blacklist file pointed by XLA_FLAGS " - "--xla_gpu_algorithm_blacklist_path=" - << GetDebugOptionsFromFlags().xla_gpu_algorithm_blacklist_path() - << " : " << proto.ShortDebugString(); + LOG(ERROR) << "To denylist this algorithm for this convolution, " + "copy-paste the following " + "proto to the denylist file pointed by XLA_FLAGS " + "--xla_gpu_algorithm_denylist_path=" + << GetDebugOptionsFromFlags().xla_gpu_algorithm_denylist_path() + << " : " << proto.ShortDebugString(); continue; } diff --git a/tensorflow/compiler/xla/service/gpu/gpu_conv_runner.cc b/tensorflow/compiler/xla/service/gpu/gpu_conv_runner.cc index a6fc4686143..5cc5fa7d16d 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_conv_runner.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_conv_runner.cc @@ -484,11 +484,12 @@ Status RunGpuConv(const HloCustomCallInstruction* conv, return RunGpuConvImpl(params, scratch_allocator, stream, options); default: - LOG(FATAL) << conv->ToString(); + return Unimplemented("Unimplemented convolution %s", + conv->ToString()); } } default: - LOG(FATAL) << conv->ToString(); + return Unimplemented("Unimplemented convolution %s", conv->ToString()); } } diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index 89c5e123a48..726f1963545 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -71,7 +71,6 @@ GpuExecutable::GpuExecutable( CHECK(has_module() && 
assignment_); GpuDebugInfoManager::Get()->RegisterModule(module().name(), shared_module(), assignment_); - ComputeThunkAnnotations(); } GpuExecutable::~GpuExecutable() { @@ -93,12 +92,6 @@ GpuExecutable::~GpuExecutable() { } } -void GpuExecutable::ComputeThunkAnnotations() { - for (Thunk* thunk : thunk_schedule_->TotalOrder()) { - thunk->ComputeAnnotations(); - } -} - Status GpuExecutable::CheckCompatibilityWithServiceExecutableRunOptions( const ServiceExecutableRunOptions* run_options) { se::Stream* main_stream = run_options->stream(); @@ -186,8 +179,8 @@ Status GpuExecutable::ExecuteThunks( stream->ThenWaitFor(FindOrDie(thunk_to_finish_event, dependency).get()); } - VLOG(2) << "Executing the thunk for " << thunk->name() << " on stream " - << stream_no; + VLOG(2) << "Executing the thunk for " << thunk->profile_annotation() + << " on stream " << stream_no; const GpuExecutableRunOptions* gpu_options = run_options->run_options().gpu_executable_run_options(); Thunk::ExecuteParams thunk_params{ @@ -487,6 +480,12 @@ StatusOr GpuExecutable::ExecuteAsyncOnStream( ExecutionInput& input = arguments[alias->parameter_number]; MaybeOwningDeviceMemory* maybe_owning_memory = input.MutableBuffer(alias->parameter_index); + if (alias->must_alias() && !maybe_owning_memory->HasOwnership()) { + return InvalidArgument( + "An input was configured to be must-alias at " + "compile time but not donated at runtime: %s", + alias->ToString()); + } if (absl::optional owning = maybe_owning_memory->Release()) { // If the caller passes the ownership of the device memory, reuse it diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h index 0da446c9739..516fa9b269a 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h @@ -115,9 +115,6 @@ class GpuExecutable : public Executable { StatusOr ResolveConstantGlobals( stream_executor::Stream* stream); - // Computes annotations for each thunk and store them in thunk_annotations_. - void ComputeThunkAnnotations(); - // GpuExecutable check with either AMD's ISA version, or Nvidia's major minor // version for compute capability, depending on the hardware. Status CheckCompatibilityWithServiceExecutableRunOptions( diff --git a/tensorflow/compiler/xla/service/gpu/hlo_algorithm_blacklist.cc b/tensorflow/compiler/xla/service/gpu/hlo_algorithm_denylist.cc similarity index 81% rename from tensorflow/compiler/xla/service/gpu/hlo_algorithm_blacklist.cc rename to tensorflow/compiler/xla/service/gpu/hlo_algorithm_denylist.cc index 601c805ce16..4a0075f2870 100644 --- a/tensorflow/compiler/xla/service/gpu/hlo_algorithm_blacklist.cc +++ b/tensorflow/compiler/xla/service/gpu/hlo_algorithm_denylist.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/xla/service/gpu/hlo_algorithm_blacklist.h" +#include "tensorflow/compiler/xla/service/gpu/hlo_algorithm_denylist.h" #include @@ -24,7 +24,7 @@ limitations under the License. 
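The gpu_executable.cc hunk above adds a runtime check: if an input was marked must-alias at compile time but the caller did not donate ownership of its buffer, execution fails with InvalidArgument instead of silently aliasing borrowed memory. A simplified sketch of that check follows, with `MaybeOwningBuffer` standing in for xla::MaybeOwningDeviceMemory and a plain string standing in for Status.

```cpp
#include <iostream>
#include <optional>
#include <string>
#include <utility>
#include <vector>

struct MaybeOwningBuffer {
  // Owned storage if the caller donated the buffer, std::nullopt otherwise.
  std::optional<std::vector<char>> owned;
  bool HasOwnership() const { return owned.has_value(); }
};

struct InputAlias {
  bool must_alias;
  std::string description;
};

// Returns an error message on failure, empty string on success (stand-in for
// returning InvalidArgument vs. Status::OK).
std::string CheckDonation(const InputAlias& alias, MaybeOwningBuffer& input) {
  if (alias.must_alias && !input.HasOwnership()) {
    return "input configured as must-alias at compile time but not donated "
           "at runtime: " + alias.description;
  }
  if (input.HasOwnership()) {
    std::vector<char> taken = std::move(*input.owned);
    input.owned.reset();  // stand-in for MaybeOwningDeviceMemory::Release()
    (void)taken;  // the runtime may now reuse the donated memory for outputs
  }
  return "";
}

int main() {
  MaybeOwningBuffer borrowed;                        // not donated
  MaybeOwningBuffer donated{std::vector<char>(16)};  // donated

  std::cout << CheckDonation({true, "param 0"}, borrowed) << "\n";  // error
  std::cout << (CheckDonation({true, "param 1"}, donated).empty() ? "ok"
                                                                  : "error")
            << "\n";  // ok
}
```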
namespace xla { namespace gpu { -constexpr char kDefaultBlacklist[] = R"pb( +constexpr char kDefaultDenylist[] = R"pb( entries { hlo: "(f32[4,32,32,32]{2,1,3,0}, u8[0]{0}) custom-call(f32[4,32,32,32]{2,1,3,0}, f32[5,5,32,32]{1,0,2,3}), window={size=5x5 pad=2_2x2_2}, dim_labels=b01f_01io->b01f, custom_call_target=\"__cudnn$convForward\", backend_config=\"{conv_result_scale:1}\"" cc { major: 7 } @@ -41,28 +41,26 @@ constexpr char kDefaultBlacklist[] = R"pb( } )pb"; -absl::Span -GetBlacklistedConvAlgorithms(tensorflow::ComputeCapability cc, - tensorflow::CudnnVersion cudnn_version, - const std::string& blas_version, - const std::string& hlo) { +absl::Span GetDisabledConvAlgorithms( + tensorflow::ComputeCapability cc, tensorflow::CudnnVersion cudnn_version, + const std::string& blas_version, const std::string& hlo) { // Key is the tuple of canonicalized hlo, compute capability major/minor, // cudnn version major/minor/patch, blas version. using MapType = absl::flat_hash_map< std::tuple, std::vector>; - static MapType* blacklist = [] { + static MapType* denylist = [] { MapType* list = new MapType(); - AlgorithmBlacklist proto; + AlgorithmDenylist proto; std::string file_path = - GetDebugOptionsFromFlags().xla_gpu_algorithm_blacklist_path(); + GetDebugOptionsFromFlags().xla_gpu_algorithm_denylist_path(); if (!file_path.empty()) { TF_CHECK_OK(tensorflow::ReadTextProto(tensorflow::Env::Default(), file_path, &proto)); } else { CHECK(tensorflow::protobuf::TextFormat::ParseFromString( - std::string(kDefaultBlacklist), &proto)); + std::string(kDefaultDenylist), &proto)); } for (const auto& entry : proto.entries()) { for (const auto& algo : entry.algos()) { @@ -77,10 +75,10 @@ GetBlacklistedConvAlgorithms(tensorflow::ComputeCapability cc, return list; }(); - auto iter = blacklist->find(std::make_tuple( + auto iter = denylist->find(std::make_tuple( hlo, cc.major(), cc.minor(), cudnn_version.major(), cudnn_version.minor(), cudnn_version.patch(), std::string(blas_version))); - if (iter != blacklist->end()) { + if (iter != denylist->end()) { return iter->second; } return {}; diff --git a/tensorflow/compiler/xla/service/gpu/hlo_algorithm_blacklist.h b/tensorflow/compiler/xla/service/gpu/hlo_algorithm_denylist.h similarity index 62% rename from tensorflow/compiler/xla/service/gpu/hlo_algorithm_blacklist.h rename to tensorflow/compiler/xla/service/gpu/hlo_algorithm_denylist.h index c1955a452aa..73d1219c1ab 100644 --- a/tensorflow/compiler/xla/service/gpu/hlo_algorithm_blacklist.h +++ b/tensorflow/compiler/xla/service/gpu/hlo_algorithm_denylist.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_HLO_ALGORITHM_BLACKLIST_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_HLO_ALGORITHM_BLACKLIST_H_ +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_HLO_ALGORITHM_DENYLIST_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_HLO_ALGORITHM_DENYLIST_H_ #include @@ -24,13 +24,11 @@ limitations under the License. 
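The renamed hlo_algorithm_denylist.cc above keeps the same lookup scheme: a map built once from either the --xla_gpu_algorithm_denylist_path textproto or the embedded default, keyed by the canonical HLO plus compute-capability, cuDNN and BLAS versions. Here is a self-contained sketch of that keyed lookup; `AlgorithmKey` and `DeniedAlgo` are simplified stand-ins for the proto-backed types, and the hard-coded entry is illustrative only.

```cpp
#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <tuple>
#include <vector>

struct DeniedAlgo {
  int64_t id;
  bool tensor_ops;
};

// (hlo, cc_major, cc_minor, cudnn_major, cudnn_minor, cudnn_patch, blas)
using AlgorithmKey =
    std::tuple<std::string, int, int, int, int, int, std::string>;

const std::vector<DeniedAlgo>& GetDisabledAlgos(const AlgorithmKey& key) {
  // Built once on first use; the real code parses either a user-supplied
  // textproto (--xla_gpu_algorithm_denylist_path) or the embedded default.
  static const std::map<AlgorithmKey, std::vector<DeniedAlgo>>* denylist = [] {
    auto* m = new std::map<AlgorithmKey, std::vector<DeniedAlgo>>();
    (*m)[{"conv_forward_example", 7, 0, 7, 6, 2, "9000"}] = {{0, false},
                                                             {7, true}};
    return m;
  }();
  static const std::vector<DeniedAlgo> empty;
  auto it = denylist->find(key);
  return it != denylist->end() ? it->second : empty;
}

int main() {
  AlgorithmKey key{"conv_forward_example", 7, 0, 7, 6, 2, "9000"};
  for (const DeniedAlgo& algo : GetDisabledAlgos(key)) {
    std::cout << "skip algorithm " << algo.id
              << (algo.tensor_ops ? " (tensor ops)" : "") << "\n";
  }
  std::cout << GetDisabledAlgos({"other hlo", 7, 0, 7, 6, 2, "9000"}).size()
            << " entries for an unknown key\n";  // 0
}
```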
namespace xla { namespace gpu { -absl::Span -GetBlacklistedConvAlgorithms(tensorflow::ComputeCapability cc, - tensorflow::CudnnVersion cudnn_version, - const std::string& blas_version, - const std::string& hlo); +absl::Span GetDisabledConvAlgorithms( + tensorflow::ComputeCapability cc, tensorflow::CudnnVersion cudnn_version, + const std::string& blas_version, const std::string& hlo); } // namespace gpu } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_HLO_ALGORITHM_BLACKLIST_H_ +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_HLO_ALGORITHM_DENYLIST_H_ diff --git a/tensorflow/compiler/xla/service/gpu/hlo_algorithm_blacklist_test.cc b/tensorflow/compiler/xla/service/gpu/hlo_algorithm_denylist_test.cc similarity index 84% rename from tensorflow/compiler/xla/service/gpu/hlo_algorithm_blacklist_test.cc rename to tensorflow/compiler/xla/service/gpu/hlo_algorithm_denylist_test.cc index bc24f486668..ab1cc1c79de 100644 --- a/tensorflow/compiler/xla/service/gpu/hlo_algorithm_blacklist_test.cc +++ b/tensorflow/compiler/xla/service/gpu/hlo_algorithm_denylist_test.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/xla/service/gpu/hlo_algorithm_blacklist.h" +#include "tensorflow/compiler/xla/service/gpu/hlo_algorithm_denylist.h" #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/platform/env.h" @@ -26,22 +26,22 @@ namespace xla { namespace gpu { namespace { -class BlacklistTest : public testing::Test { +class DenylistTest : public testing::Test { protected: - BlacklistTest() { + DenylistTest() { tensorflow::setenv( "XLA_FLAGS", absl::StrCat( - "--xla_gpu_algorithm_blacklist_path=", + "--xla_gpu_algorithm_denylist_path=", tensorflow::GetDataDependencyFilepath(tensorflow::io::JoinPath( "tensorflow", "compiler", "xla", "service", "gpu", "data", - "hlo_algorithm_blacklist.pbtxt"))) + "hlo_algorithm_denylist.pbtxt"))) .data(), 0); } }; -TEST_F(BlacklistTest, DefaultTest) { +TEST_F(DenylistTest, DefaultTest) { tensorflow::ComputeCapability cc; cc.set_major(7); cc.set_minor(0); @@ -49,7 +49,7 @@ TEST_F(BlacklistTest, DefaultTest) { cudnn_version.set_major(7); cudnn_version.set_minor(6); cudnn_version.set_patch(2); - auto list = GetBlacklistedConvAlgorithms( + auto list = GetDisabledConvAlgorithms( cc, cudnn_version, /*blas_version=*/"9000", R"((f16[256,112,112,64]{3,2,1,0}, u8[0]{0}) custom-call(f16[256,224,224,4]{3,2,1,0}, f16[7,7,4,64]{2,1,0,3}), window={size=7x7 stride=2x2 pad=3_3x3_3}, dim_labels=b01f_01io->b01f, custom_call_target="__cudnn$convForward", backend_config="{conv_result_scale:1}")"); ASSERT_EQ(4, list.size()); @@ -59,7 +59,7 @@ TEST_F(BlacklistTest, DefaultTest) { EXPECT_EQ(stream_executor::dnn::AlgorithmDesc(1, true), list[3]); } -TEST_F(BlacklistTest, NegativeTest) { +TEST_F(DenylistTest, NegativeTest) { tensorflow::ComputeCapability cc; cc.set_major(7); cc.set_minor(0); @@ -68,7 +68,7 @@ TEST_F(BlacklistTest, NegativeTest) { cudnn_version.set_minor(6); cudnn_version.set_minor(2); auto list = - GetBlacklistedConvAlgorithms(cc, cudnn_version, "9000", R"(invalid hlo)"); + GetDisabledConvAlgorithms(cc, cudnn_version, "9000", R"(invalid hlo)"); ASSERT_EQ(0, list.size()); } diff --git a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc index 23b29df6ec8..5d38d1b727c 100644 --- 
a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc +++ b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc @@ -38,13 +38,15 @@ using absl::StrCat; void HloToIrBindings::EmitBasePointersForHlos( absl::Span io_hlos, absl::Span non_io_hlos) { + CHECK(is_nested_); + // I/O HLOs are bound to the arguments of the current IR function, // *excluding* the output argument, which is added to non-I/O HLOs. // I.e., // - // void IrFunction(io_0, io_1, ..., io_{m-1}, output_arg, temp_buffer_base) { + // void IrFunction(io_0, io_1, ..., io_{m-1}, output_arg); llvm::Function* function = b_->GetInsertBlock()->getParent(); - CHECK_EQ(io_hlos.size() + 2, function->arg_size()); + CHECK_EQ(io_hlos.size() + 1, function->arg_size()); // An HLO can have duplicated operands. This data structure remembers which // operand HLOs are already bound to avoid rebinding the same HLO. @@ -55,11 +57,7 @@ void HloToIrBindings::EmitBasePointersForHlos( !absl::c_count(non_io_hlos, io_hlo)) << "IO HLOs and non-IO HLOs should be disjoint"; if (!already_bound_for_this_function.contains(io_hlo)) { - if (!is_nested_ && io_hlo->opcode() == HloOpcode::kGetTupleElement) { - BindHloToIrValue(*io_hlo, EmitGetTupleElement(io_hlo, &*arg_iter)); - } else { - BindHloToIrValue(*io_hlo, &*arg_iter); - } + BindHloToIrValue(*io_hlo, &*arg_iter); already_bound_for_this_function.insert(io_hlo); } ++arg_iter; @@ -69,9 +67,6 @@ void HloToIrBindings::EmitBasePointersForHlos( arg_iter->setName("output_arg"); ++arg_iter; - temp_buffer_base_ = &*arg_iter; - temp_buffer_base_->setName("temp_buffer"); - for (const HloInstruction* non_io_hlo : non_io_hlos) { if (already_bound_for_this_function.contains(non_io_hlo)) { continue; @@ -79,62 +74,23 @@ void HloToIrBindings::EmitBasePointersForHlos( already_bound_for_this_function.insert(non_io_hlo); if (non_io_hlo->opcode() == HloOpcode::kGetTupleElement) { - if (!is_nested_) { - // Lookup allocation GetTupleElement operand. - const BufferAllocation::Slice slice = - buffer_assignment_ - ->GetUniqueTopLevelSlice(non_io_hlo->LatestNonGteAncestor()) - .ConsumeValueOrDie(); - // We are not in a nested context, so check non-thread-local allocation. - CHECK(!slice.allocation()->is_thread_local()); - const int64 offset = slice.offset(); - CHECK_NE(nullptr, temp_buffer_base_); - // Emit IR for GetTupleElement instruction and bind to emitted value. - llvm::Value* base_ptr = - b_->CreateInBoundsGEP(temp_buffer_base_, b_->getInt64(offset)); - BindHloToIrValue(*non_io_hlo, - EmitGetTupleElement(non_io_hlo, base_ptr)); - } - continue; - } - - if (!buffer_assignment_->HasTopLevelAllocation(non_io_hlo)) { continue; } ShapeUtil::ForEachSubshape( non_io_hlo->shape(), [&](const Shape& /*subshape*/, const ShapeIndex& index) { - // A non-IO HLO with a buffer is bound to - // (1) an alloca if it is thread-local, or - // (2) an internal pointer in temp_buffer_base according to its - // offset. 
- auto slice_result = - buffer_assignment_->GetUniqueSlice(non_io_hlo, index); - if (!slice_result.ok()) { - return; - } - const BufferAllocation::Slice slice = - slice_result.ConsumeValueOrDie(); - if (slice.allocation()->is_thread_local()) { + if (non_io_hlo->opcode() == HloOpcode::kConstant) { + llvm::Value* global_for_constant = module_->getGlobalVariable( + llvm_ir::ConstantHloToGlobalName(*non_io_hlo)); + BindHloToIrValue(*non_io_hlo, global_for_constant); + } else { llvm::Type* pointee_type = llvm_ir::ShapeToIrType(non_io_hlo->shape(), module_); BindHloToIrValue(*non_io_hlo, llvm_ir::EmitAllocaAtFunctionEntry( pointee_type, /*name=*/"", b_), index); - } else if (slice.allocation()->is_constant()) { - llvm::Value* global_for_constant = module_->getGlobalVariable( - llvm_ir::ConstantBufferAllocationToGlobalName( - *slice.allocation())); - BindHloToIrValue(*non_io_hlo, global_for_constant); - } else { - const int64 offset = slice.offset(); - CHECK_NE(nullptr, temp_buffer_base_); - BindHloToIrValue( - *non_io_hlo, - b_->CreateInBoundsGEP(temp_buffer_base_, b_->getInt64(offset)), - index); } }); } @@ -231,14 +187,14 @@ llvm_ir::IrArray HloToIrBindings::GetIrArray(const HloInstruction& hlo, << " of " << hlo.ToString(); llvm_ir::IrArray ir_array(base_ptr, ShapeUtil::GetSubshape(hlo.shape(), shape_index)); - alias_analysis_.AddAliasingInformationToIrArray(hlo, &ir_array, shape_index); // The GPU backend emits one kernel per top-level HLO, and LLVM views // execution of one kernel as the "whole program" executed on the GPU. // Therefore if hlo's output buffer is not modified within consumer, and if // consumer runs hlo only once (so that it doesn't create two different // outputs), then we can mark ir_array as invariant over the whole program. - if (BuffersInvariantWithinConsumer(hlo, consumer, buffer_assignment_)) { + if (!is_nested_ && + BuffersInvariantWithinConsumer(hlo, consumer, buffer_assignment_)) { VLOG(2) << "Marking " << hlo.name() << " as invariant within " << consumer.name(); ir_array.MarkInvariantOverWholeProgram(&module_->getContext()); diff --git a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h index f57b594e9c1..5eef6727801 100644 --- a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h +++ b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h @@ -25,7 +25,6 @@ limitations under the License. #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" -#include "tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h" #include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h" namespace xla { @@ -42,8 +41,7 @@ class HloToIrBindings { : buffer_assignment_(buffer_assignment), is_nested_(is_nested), b_(b), - module_(llvm_module), - alias_analysis_(module, *buffer_assignment_, &b_->getContext()) {} + module_(llvm_module) {} void EmitBasePointersForHlos( absl::Span io_hlos, @@ -116,8 +114,6 @@ class HloToIrBindings { // The address of the memory block that contains all temporary buffers. 
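The hlo_to_ir_bindings.cc rewrite above now applies only to nested computations: a constant HLO is bound to the module-level global holding its literal, any other non-I/O HLO gets an alloca at the function entry, and the old temp-buffer GEP path is gone. The following is a standalone LLVM sketch of that binding rule, using placeholder names rather than llvm_ir::ConstantHloToGlobalName or EmitAllocaAtFunctionEntry.

```cpp
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  llvm::LLVMContext ctx;
  llvm::Module module("binding_sketch", ctx);
  llvm::IRBuilder<> b(ctx);
  llvm::Type* f32 = b.getFloatTy();

  // Stand-in for a constant HLO already emitted as a module-level global.
  new llvm::GlobalVariable(module, f32, /*isConstant=*/true,
                           llvm::GlobalValue::InternalLinkage,
                           llvm::ConstantFP::get(f32, 1.0), "constant_hlo");

  // Nested-computation-style function: parameters plus one output pointer,
  // and (after this change) no trailing temp-buffer argument.
  llvm::FunctionType* fn_ty = llvm::FunctionType::get(
      b.getVoidTy(), {b.getInt8PtrTy(), b.getInt8PtrTy()}, /*isVarArg=*/false);
  llvm::Function* fn = llvm::Function::Create(
      fn_ty, llvm::Function::InternalLinkage, "nested_computation", &module);
  b.SetInsertPoint(llvm::BasicBlock::Create(ctx, "entry", fn));

  // Binding rule from the hunk above: constants resolve to their global,
  // every other non-I/O HLO gets a function-entry alloca.
  llvm::Value* constant_binding =
      module.getGlobalVariable("constant_hlo", /*AllowInternal=*/true);
  llvm::Value* other_binding = b.CreateAlloca(f32, nullptr, "non_io_hlo");
  b.CreateRetVoid();

  (void)constant_binding;
  (void)other_binding;
  module.print(llvm::outs(), /*AAW=*/nullptr);
}
```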
llvm::Value* temp_buffer_base_ = nullptr; - - llvm_ir::AliasAnalysis alias_analysis_; }; } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc b/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc index 43cc5f5a2ae..5fe459a70bc 100644 --- a/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc @@ -25,13 +25,15 @@ namespace gpu { InfeedThunk::InfeedThunk( ThunkInfo thunk_info, const ShapeTree& infeed_slices) - : Thunk(Kind::kInfeed, thunk_info), infeed_slices_(infeed_slices) {} + : Thunk(Kind::kInfeed, thunk_info), + hlo_instruction_(thunk_info.hlo_instruction), + infeed_slices_(infeed_slices) {} Status InfeedThunk::ExecuteOnStream(const ExecuteParams& params) { auto& stream = *params.stream; auto& buffer_allocations = *params.buffer_allocations; - VLOG(2) << "Infeeding to GPU: " << hlo_instruction()->ToString(); + VLOG(2) << "Infeeding to GPU: " << hlo_instruction_->ToString(); auto op_profiler = params.profiler->MakeScopedInstructionProfiler(profile_index()); diff --git a/tensorflow/compiler/xla/service/gpu/infeed_thunk.h b/tensorflow/compiler/xla/service/gpu/infeed_thunk.h index ec33235c466..ab410661ba1 100644 --- a/tensorflow/compiler/xla/service/gpu/infeed_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/infeed_thunk.h @@ -43,6 +43,7 @@ class InfeedThunk : public Thunk { Status ExecuteOnStream(const ExecuteParams& params) override; private: + const HloInstruction* hlo_instruction_; const ShapeTree infeed_slices_; }; diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc index a0580e2ab04..b994ead17ca 100644 --- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc @@ -29,12 +29,27 @@ limitations under the License. namespace xla { namespace gpu { +namespace { +bool ElementIsF32OrF16(const Shape& shape) { + PrimitiveType type = shape.element_type(); + return type == F32 || type == F16; +} +} // namespace + /*static*/ bool GpuInstructionFusion::IsExpensive( const HloInstruction& instruction) { - // We say that floating-point division is cheap on the GPU. - if (instruction.opcode() == HloOpcode::kDivide && - ShapeUtil::ElementIsFloating(instruction.shape())) { - return false; + // We say that some floating-point math ops are cheap on the GPU. Unlike other + // intrinsics that can be expanded into many instructions, Div and Rsqrt are + // lowered into single hardware instructions. + switch (instruction.opcode()) { + case HloOpcode::kDivide: + case HloOpcode::kRsqrt: + if (ElementIsF32OrF16(instruction.shape())) { + return false; + } + break; + default: + break; } return InstructionFusion::IsExpensive(instruction); } diff --git a/tensorflow/compiler/xla/service/gpu/ir/dialect_registration.cc b/tensorflow/compiler/xla/service/gpu/ir/dialect_registration.cc new file mode 100644 index 00000000000..2e3461951d8 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/ir/dialect_registration.cc @@ -0,0 +1,20 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.h" + +// Static initialization for GPU thunks op registration. +static mlir::DialectRegistration + xla_thunks_ops; diff --git a/tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.cc b/tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.cc new file mode 100644 index 00000000000..154612824ef --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.cc @@ -0,0 +1,42 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file defines the operations used in the Thunk dialect. + +#include "tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.h" + +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/DialectImplementation.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" + +namespace mlir { +#include "tensorflow/compiler/xla/service/gpu/ir/xla_thunks_structs.cc.inc" +namespace xla_thunks { + +XLAThunksDialect::XLAThunksDialect(MLIRContext *context) + : Dialect(getDialectNamespace(), context, TypeID::get()) { + addOperations< +#define GET_OP_LIST +#include "tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.cc.inc" + >(); +} + +#define GET_OP_CLASSES +#include "tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.cc.inc" + +} // namespace xla_thunks +} // namespace mlir diff --git a/tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.h b/tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.h new file mode 100644 index 00000000000..ede9adb9ab1 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.h @@ -0,0 +1,42 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_XLA_THUNKS_OPS_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_XLA_THUNKS_OPS_H_ + +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/OpImplementation.h" // from @llvm-project + +namespace mlir { +class OpBuilder; + +#include "tensorflow/compiler/xla/service/gpu/ir/xla_thunks_structs.h.inc" + +namespace xla_thunks { + +class XLAThunksDialect : public Dialect { + public: + explicit XLAThunksDialect(MLIRContext *context); + static StringRef getDialectNamespace() { return "xla_thunks"; } +}; + +#define GET_OP_CLASSES +#include "tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.h.inc" + +} // namespace xla_thunks +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_XLA_THUNKS_OPS_H_ diff --git a/tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.td b/tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.td new file mode 100644 index 00000000000..38602550864 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.td @@ -0,0 +1,57 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Operation definition file for GPU thunks. + +#ifndef XLA_THUNKS_OPS +#define XLA_THUNKS_OPS + +include "mlir/Dialect/LLVMIR/LLVMOpBase.td" +include "mlir/IR/OpBase.td" + +class LLVMPointerTo + : ContainerType().isPointerTy()">, + "$_self.cast<::mlir::LLVM::LLVMType>().getPointerElementTy()", + "LLVM pointer">; + +def XLAThunks_Dialect : Dialect { + let name = "xla_thunks"; + let cppNamespace = "xla_thunks"; +} + +class ThunkOp traits = []> : + Op; + +def AllocationSlice : StructAttr<"AllocationSlice", XLAThunks_Dialect, [ + StructFieldAttr<"allocation_index", I64Attr>, + StructFieldAttr<"offset", I64Attr>, + StructFieldAttr<"size", I64Attr>, + ]> { + let description = "Defines a slice of an allocation for XLA thunk ops"; +} + +def MemzeroThunkOp : ThunkOp<"execute_memzero_thunk"> { + let arguments = (ins + LLVMPointerTo>:$execute_params, + AllocationSlice:$allocation_slice + ); + let results = (outs + I<1>:$ok, + LLVMPointerTo>:$error_message + ); +} + +#endif // XLA_THUNKS_OPS diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc index 04e24733971..31203b9c5f0 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc @@ -192,9 +192,6 @@ Status IrEmitter::EmitCallToNestedComputation( llvm::Value* casted_output = AddrCastToDefault(output, b_); arguments.push_back(casted_output); - // It is not required to do address space cast because TempBufferBase - // is always in addrspace 0. 
- arguments.push_back(bindings_.GetTempBufferBase()); Call(emitted_function, arguments); return Status::OK(); diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc index 72f48c49096..e96c5f05e60 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc @@ -67,8 +67,6 @@ Status IrEmitterNested::CodegenNestedComputation() { root_shape, ir_emitter_context_->llvm_module()->getDataLayout()); argument_dereferenceable_bytes.push_back(root_size); } - // The base pointer of the memory block for all pre-allocated temp buffers. - argument_types.push_back(b_.getInt8PtrTy()); llvm::FunctionType* function_type = llvm::FunctionType::get(b_.getVoidTy(), argument_types, false); @@ -119,8 +117,8 @@ Status IrEmitterNested::CodegenNestedComputation() { llvm::Value* root_value = bindings_.GetBasePointer(*root_instruction); const Shape& return_shape = root_instruction->shape(); - // Second last argument is the out parameter. - llvm::Argument* out_parameter = std::prev(function->arg_end(), 2); + // Last argument is the out parameter. + llvm::Argument* out_parameter = std::prev(function->arg_end(), 1); if (ShapeUtil::IsScalar(return_shape)) { llvm::Value* ret_value = Load(root_value, "load_ret_value"); diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index a232bf7fce5..61b78b6004d 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -27,6 +27,7 @@ limitations under the License. #include "absl/container/inlined_vector.h" #include "absl/memory/memory.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "absl/types/optional.h" #include "absl/types/span.h" #include "llvm/ADT/StringRef.h" @@ -1284,6 +1285,7 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) { if (destination_buffer != source_address) { // TODO(b/26783907): Figure out why we never seem to share buffers for // key/value sort. + VLOG(2) << sort->name() << " requires initial D2D copy for operand " << i; thunks.push_back(absl::make_unique( Thunk::ThunkInfo(), /*source_address=*/source_address, @@ -1294,6 +1296,7 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) { uint64 dimension_to_sort_bound = keys_shape.dimensions(dimension_to_sort); int64 num_stages = tensorflow::Log2Ceiling(dimension_to_sort_bound); + VLOG(2) << sort->name() << " requires " << num_stages << " stages."; CHECK_GE(1ULL << num_stages, dimension_to_sort_bound); CHECK_LT(1ULL << (num_stages - 1), dimension_to_sort_bound); @@ -1368,11 +1371,27 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) { ir_emitter_context_->gpu_device_info().threads_per_block_limit || total_shared_memory_needed > ir_emitter_context_->gpu_device_info().shared_memory_per_block; + VLOG(2) << absl::StreamFormat( + "%s %s use tiling. No tiling if any of the following is true: " + "kTileSize=%d < 128, " + "kThreadsPerBlock=%d > threads_per_block_limit=%d, " + "total_shared_memory_needed=%d > shared_memory_per_block=%d", + sort->name(), (no_tiling ? 
"won't" : "will"), kTileSize, kThreadsPerBlock, + ir_emitter_context_->gpu_device_info().threads_per_block_limit, + total_shared_memory_needed, + ir_emitter_context_->gpu_device_info().shared_memory_per_block); uint64 num_blocks = CeilOfRatio(num_iterations, kThreadsPerBlock); LaunchDimensions tiled_launch_dimensions(num_blocks, kThreadsPerBlock); + VLOG(2) << absl::StreamFormat("%s launch dims: %d blocks, %d threads/block", + sort->name(), num_blocks, kThreadsPerBlock); auto emit_kernel = [&](absl::Span xor_masks) { + VLOG(2) << absl::StreamFormat( + "%s uses kernel for xor masks [%s]", sort->name(), + absl::StrJoin(xor_masks, ", ", [](std::string* out, int64 xor_mask) { + absl::StrAppendFormat(out, "0x%x", xor_mask); + })); thunks.push_back( BuildKernelThunk(sort, /*implements_whole_instruction=*/false)); LaunchDimensions launch_dimensions = xor_masks.size() > 1 @@ -1421,6 +1440,9 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) { if (!xor_masks.empty()) { TF_RETURN_IF_ERROR(emit_kernel(xor_masks)); } + VLOG(2) << absl::StreamFormat( + "%s requires %d thunks (including any D2D copies)", sort->name(), + thunks.size()); AddThunkToThunkSequence(absl::make_unique( GetThunkInfo(sort), std::move(thunks))); @@ -1747,6 +1769,25 @@ std::unique_ptr IrEmitterUnnested::BuildKernelThunk( auto buffers_it = non_constant_buffers.begin(); for (; arg_it != kernel->arg_end(); ++arg_it, ++buffers_it) { kernel_args[*buffers_it] = arg_it; + + // Annotate all allocations with LLVM's `noalias`. + // There are three kinds of allocations: + // * Read-only allocations, aka input parameters that are not aliased with + // outputs. + // * Read-write allocations, including all output buffers, some of which + // may alias with input HLO parameters, but aliased HLO buffers are always + // assigned with the same allocation. + // * The temp buffer. + // + // Read-only allocations may overlap with each other, but since they are + // not mutated, they can always be annotated with `noalias` per LLVM + // semantics. + // + // Read-write allocations and the temp buffer don't overlap with any + // allocations, therefore they can also be annotated with `noalias`. + kernel->addParamAttr( + arg_it->getArgNo(), + llvm::Attribute::get(arg_it->getContext(), llvm::Attribute::NoAlias)); } } diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc index d2126a8d17d..1228a1b4823 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc @@ -83,10 +83,10 @@ const int kDefaultInlineThreshold = 1100; static string GetSmName(std::pair compute_capability) { int compute_capability_version = compute_capability.first * 10 + compute_capability.second; - int sm_version = 35; + int sm_version = 30; // If the current compute capability isn't known, fallback to the // most recent version before it. - for (int v : {75, 72, 70, 62, 61, 60, 53, 52, 50, 37, 35}) { + for (int v : {75, 72, 70, 62, 61, 60, 53, 52, 50, 37, 35, 32, 30}) { if (v <= compute_capability_version) { sm_version = v; break; @@ -630,8 +630,10 @@ StatusOr> EmitModuleToHsaco( // Locate lld. // TODO(whchung@gmail.com): change to tensorflow::ROCmRoot() after // ROCm-Device-Libs PR. 
- std::string lld_path = tensorflow::io::JoinPath("/opt/rocm", "hcc/bin"); - auto lld_program = llvm::sys::findProgramByName("ld.lld", {lld_path}); + std::string lld_path_1 = tensorflow::io::JoinPath("/opt/rocm", "hcc/bin"); + std::string lld_path_2 = tensorflow::io::JoinPath("/opt/rocm", "llvm/bin"); + auto lld_program = + llvm::sys::findProgramByName("ld.lld", {lld_path_1, lld_path_2}); if (!lld_program) { return xla::InternalError("unable to find ld.lld in PATH: %s", lld_program.getError().message()); diff --git a/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.cc b/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.cc index 755413beeee..25ab9a7ce6e 100644 --- a/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.cc @@ -544,10 +544,11 @@ NcclAllReduceThunk::NcclAllReduceThunk( ThunkInfo thunk_info, int64 replica_count, std::vector buffers) : Thunk(Thunk::kNcclAllReduce, thunk_info), + hlo_instruction_(thunk_info.hlo_instruction), replica_count_(replica_count), buffers_(std::move(buffers)), aux_data_(absl::make_unique()) { - CHECK_EQ(hlo_instruction()->operand_count(), buffers_.size()); + CHECK_EQ(hlo_instruction_->operand_count(), buffers_.size()); } // Figures out which devices (named by their replica-ids) are participating in @@ -557,7 +558,7 @@ Status NcclAllReduceThunk::ExecuteOnStream(const ExecuteParams& params) { auto op_profiler = params.profiler->MakeScopedInstructionProfiler(profile_index()); - auto* instr = Cast(hlo_instruction()); + auto* instr = Cast(hlo_instruction_); int64 local_device_ordinal = params.stream->parent()->device_ordinal(); GlobalDeviceId global_device_id; if (params.gpu_global_device_ids) { @@ -606,7 +607,7 @@ Status NcclAllReduceThunk::ExecuteOnStream(const ExecuteParams& params) { // Find or create the rendezvous for this collective operation. RendezvousKey rendezvous_key = RendezvousKey::FromInstruction( - params.run_id, global_devices, local_devices.size(), hlo_instruction()); + params.run_id, global_devices, local_devices.size(), hlo_instruction_); if (VLOG_IS_ON(2)) { std::vector local_participants; @@ -633,13 +634,12 @@ Status NcclAllReduceThunk::ExecuteOnStream(const ExecuteParams& params) { pbuffer.destination_data = params.buffer_allocations->GetDeviceAddress(buffer.destination_buffer); pbuffer.primitive_type = - hlo_instruction()->operand(i)->shape().element_type(); + hlo_instruction_->operand(i)->shape().element_type(); participant.buffers.push_back(pbuffer); } participant.local_devices = std::move(local_devices); participant.nccl_unique_id_callback = params.nccl_unique_id_callback; - auto reduction_kind = - MatchReductionComputation(hlo_instruction()->to_apply()); + auto reduction_kind = MatchReductionComputation(hlo_instruction_->to_apply()); CHECK(reduction_kind.has_value()); participant.reduction_kind = *reduction_kind; diff --git a/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.h b/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.h index 1df4f0805a6..cbd4fd3aa51 100644 --- a/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.h @@ -73,6 +73,7 @@ class NcclAllReduceThunk : public Thunk { // build, and we don't want to expose *that* mess in the header.) 
struct AuxData; + const HloInstruction* hlo_instruction_; const int64 replica_count_; const std::vector buffers_; std::unique_ptr aux_data_; diff --git a/tensorflow/compiler/xla/service/gpu/outfeed_thunk.cc b/tensorflow/compiler/xla/service/gpu/outfeed_thunk.cc index 104366fd78c..83066a4addf 100644 --- a/tensorflow/compiler/xla/service/gpu/outfeed_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/outfeed_thunk.cc @@ -26,13 +26,14 @@ namespace gpu { OutfeedThunk::OutfeedThunk(ThunkInfo thunk_info, ShapeTree outfeed_slices) : Thunk(Kind::kOutfeed, thunk_info), + hlo_instruction_(thunk_info.hlo_instruction), outfeed_slices_(std::move(outfeed_slices)) {} Status OutfeedThunk::ExecuteOnStream(const ExecuteParams& params) { auto& stream = *params.stream; auto& buffer_allocations = *params.buffer_allocations; - VLOG(2) << "Outfeeding from GPU: " << hlo_instruction()->ToString(); + VLOG(2) << "Outfeeding from GPU: " << hlo_instruction_->ToString(); auto op_profiler = params.profiler->MakeScopedInstructionProfiler(profile_index()); @@ -41,13 +42,13 @@ Status OutfeedThunk::ExecuteOnStream(const ExecuteParams& params) { outfeed_manager->BlockingGetNextDestination(); // Nothing to be done for empty tuples. - if (ShapeUtil::IsEmptyTuple(hlo_instruction()->operand(0)->shape())) { + if (ShapeUtil::IsEmptyTuple(hlo_instruction_->operand(0)->shape())) { return Status::OK(); } - CHECK(ShapeUtil::Compatible(hlo_instruction()->operand(0)->shape(), + CHECK(ShapeUtil::Compatible(hlo_instruction_->operand(0)->shape(), outfeed_buffers->shape())) << "XLA program outfeed request of shape " - << hlo_instruction()->operand(0)->shape().ToString() + << hlo_instruction_->operand(0)->shape().ToString() << " did not match the runtime's outfeed buffer of shape " << outfeed_buffers->shape().ToString(); diff --git a/tensorflow/compiler/xla/service/gpu/outfeed_thunk.h b/tensorflow/compiler/xla/service/gpu/outfeed_thunk.h index e99174e3c6c..9174e605783 100644 --- a/tensorflow/compiler/xla/service/gpu/outfeed_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/outfeed_thunk.h @@ -41,6 +41,7 @@ class OutfeedThunk : public Thunk { Status ExecuteOnStream(const ExecuteParams& params) override; private: + const HloInstruction* hlo_instruction_; const ShapeTree outfeed_slices_; }; diff --git a/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc b/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc index 15cf2493549..903acf4f57d 100644 --- a/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc @@ -28,12 +28,6 @@ SequentialThunk::SequentialThunk(ThunkInfo thunk_info, std::vector> thunks) : Thunk(Kind::kSequential, thunk_info), thunks_(std::move(thunks)) {} -void SequentialThunk::ComputeAnnotations() { - for (const auto& thunk : thunks_) { - thunk->ComputeAnnotations(); - } -} - Status SequentialThunk::Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) { for (auto& thunk : thunks_) { diff --git a/tensorflow/compiler/xla/service/gpu/sequential_thunk.h b/tensorflow/compiler/xla/service/gpu/sequential_thunk.h index 127c5bcf734..455ee60fa5c 100644 --- a/tensorflow/compiler/xla/service/gpu/sequential_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/sequential_thunk.h @@ -39,7 +39,6 @@ class SequentialThunk : public Thunk { const std::vector>& thunks() const { return thunks_; } - void ComputeAnnotations() override; Status Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) override; Status ExecuteOnStream(const ExecuteParams& 
params) override; diff --git a/tensorflow/compiler/xla/service/gpu/tests/BUILD b/tensorflow/compiler/xla/service/gpu/tests/BUILD index a23c14017a4..a2bddd2d0d7 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/BUILD +++ b/tensorflow/compiler/xla/service/gpu/tests/BUILD @@ -479,7 +479,10 @@ glob_lit_tests( "no_pip", ], driver = "@llvm-project//mlir:run_lit.sh", - test_file_exts = ["hlo"], + test_file_exts = [ + "hlo", + "mlir", + ], ) # Bundle together all of the test utilities that are used by tests. @@ -487,7 +490,17 @@ filegroup( name = "test_utilities", testonly = True, data = [ - "//tensorflow/compiler/xla/service/gpu/tests:hlo_to_llvm_ir", + ":hlo_to_llvm_ir", + ":xla-thunks-opt", "@llvm-project//llvm:FileCheck", ], ) + +# Binary with only the thunks dialect registered, for testing purposes. +tf_cc_binary( + name = "xla-thunks-opt", + deps = [ + "//tensorflow/compiler/mlir:tf_mlir_opt_main", + "//tensorflow/compiler/xla/service/gpu:xla_thunks_dialect_registration", + ], +) diff --git a/tensorflow/compiler/xla/service/gpu/tests/execute_memzero_thunk.mlir b/tensorflow/compiler/xla/service/gpu/tests/execute_memzero_thunk.mlir new file mode 100644 index 00000000000..82f3f06db5c --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/tests/execute_memzero_thunk.mlir @@ -0,0 +1,15 @@ +// RUN: xla-thunks-opt %s | FileCheck --color --dump-input=fail %s + +func @main( %execute_params: !llvm.ptr ) { + // CHECK: "xla_thunks.execute_memzero_thunk" + // CHECK-SAME: {allocation_index = 0 : i64, offset = 128 : i64, size = 1024 : i64} + // CHECK-SAME: (!llvm.ptr) -> (i1, !llvm.ptr) + %ok, %error_message = + "xla_thunks.execute_memzero_thunk"( %execute_params ) + { allocation_slice = { allocation_index = 0 + , offset = 128 + , size = 1024 } } + : (!llvm.ptr) -> (i1, !llvm.ptr) + return +} + diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_alignment_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_alignment_test.cc index 914b81c632f..3ebac925886 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_alignment_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_alignment_test.cc @@ -45,7 +45,7 @@ ENTRY main { )"; CompileAndVerifyIr(hlo_string, R"( -CHECK: @fusion(i8* align 64 dereferenceable(600) %alloc0, i8* align 16 dereferenceable(400) %alloc1, i8* align 64 dereferenceable(864) %temp_buf) +CHECK: @fusion(i8* noalias align 64 dereferenceable(600) %alloc0, i8* noalias align 16 dereferenceable(400) %alloc1, i8* noalias align 64 dereferenceable(864) %temp_buf) )"); } diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc index 38ff2da7161..8ec00d73711 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc @@ -51,16 +51,9 @@ TEST_F(GpuNoAliasTest, Concat) { hlo_module->AddEntryComputation(std::move(computation)); CompileAndVerifyIr(std::move(hlo_module), - R"( -; CHECK: %[[x_gep:.*]] = getelementptr inbounds [2 x [2 x float]], [2 x [2 x float]]* %x{{.*}}, i32 0 -; CHECK: load float, float* %[[x_gep]], {{.*}}, !noalias ![[param_noalias:.*]] -; CHECK: %[[y_gep:.*]] = getelementptr inbounds [2 x [2 x float]], [2 x [2 x float]]* %y{{.*}}, i32 0 -; CHECK: load float, float* %[[y_gep]], {{.*}}, !noalias ![[param_noalias]] -; CHECK: %[[result_ptr:.*]] = bitcast [2 x [6 x float]]* %fusion{{.*}} to float* -; CHECK: %[[result_gep:.*]] = getelementptr inbounds float, float* %[[result_ptr]] -; CHECK: store float 
{{.*}}, float* %[[result_gep]], align 4, !alias.scope ![[param_noalias]] -; CHECK: ![[param_noalias]] = !{![[retval_buffer:.*]]} - )", + R"(CHECK-LABEL: define{{.*}}void @fusion + CHECK-SAME: i8* noalias align {{[0-9]*}} dereferenceable({{[0-9]*}}) %[[OUTPUT_ALLOC:[a-z0-9]*]] + CHECK: %fusion.raw = {{.*}} %[[OUTPUT_ALLOC]])", /*match_optimized_ir=*/false); } diff --git a/tensorflow/compiler/xla/service/gpu/tests/scatter.hlo b/tensorflow/compiler/xla/service/gpu/tests/scatter.hlo index 796c0adadd2..c9e7daeb3bc 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/scatter.hlo +++ b/tensorflow/compiler/xla/service/gpu/tests/scatter.hlo @@ -1,6 +1,6 @@ // RUN: hlo_to_llvm_ir %s | FileCheck %s -// CHECK-LABEL: define void @scatter_TensorFlowScatterV1(i8* align 64 dereferenceable(36) %alloc0, i8* align 16 dereferenceable(36) %alloc1, i8* align 16 dereferenceable(24) %alloc2, i8* align 16 dereferenceable(8) %alloc3) { +// CHECK-LABEL: define void @scatter_TensorFlowScatterV1(i8* noalias align 64 dereferenceable(36) %alloc0, i8* noalias align 16 dereferenceable(36) %alloc1, i8* noalias align 16 dereferenceable(24) %alloc2, i8* noalias align 16 dereferenceable(8) %alloc3) { // CHECK: entry: // CHECK: %[[VAL_32:.*]] = alloca i32, align 4 // CHECK: %[[VAL_0:.*]] = getelementptr inbounds i8, i8* %[[VAL_1:.*]], i64 0 @@ -26,7 +26,7 @@ // CHECK: ret void // CHECK: scatter_TensorFlowScatterV1.in_bounds-true: ; preds = %[[VAL_24]] // CHECK: %[[VAL_25:.*]] = getelementptr inbounds [2 x i32], [2 x i32]* %[[VAL_8]], i32 0, i32 %[[VAL_19]] -// CHECK: %[[VAL_26:.*]] = load i32, i32* %[[VAL_25]], align 4, !invariant.load !4, !noalias !5 +// CHECK: %[[VAL_26:.*]] = load i32, i32* %[[VAL_25]], align 4, !invariant.load !4 // CHECK: %[[VAL_27:.*]] = add i32 0, %[[VAL_26]] // CHECK: %[[VAL_28:.*]] = icmp ult i32 %[[VAL_26]], 3 // CHECK: %[[VAL_29:.*]] = and i1 true, %[[VAL_28]] @@ -37,7 +37,7 @@ // CHECK: %[[VAL_31:.*]] = getelementptr inbounds [3 x [3 x i32]], [3 x [3 x i32]]* %[[VAL_2]], i32 0, i32 %[[VAL_27]], i32 %[[VAL_18]] // CHECK: %[[VAL_33:.*]] = bitcast [2 x [3 x i32]]* %[[VAL_11]] to i32* // CHECK: %[[VAL_34:.*]] = getelementptr inbounds i32, i32* %[[VAL_33]], i32 %[[VAL_15]] -// CHECK: %[[VAL_35:.*]] = load i32, i32* %[[VAL_34]], align 4, !invariant.load !4, !noalias !5 +// CHECK: %[[VAL_35:.*]] = load i32, i32* %[[VAL_34]], align 4, !invariant.load !4 // CHECK: store i32 %[[VAL_35]], i32* %[[VAL_32]], align 4 // CHECK: %[[VAL_36:.*]] = load i32, i32* %[[VAL_32]], align 4 // CHECK: store atomic i32 %[[VAL_36]], i32* %[[VAL_31]] unordered, align 4 @@ -48,9 +48,6 @@ // CHECK: !2 = !{i32 0, i32 1} // CHECK: !3 = !{i32 0, i32 6} // CHECK: !4 = !{} -// CHECK: !5 = !{!6} -// CHECK: !6 = !{!"buffer: {index:0, offset:0, size:36}", !7} -// CHECK: !7 = !{!"XLA global AA domain"} HloModule TensorFlowScatterV1 @@ -75,7 +72,7 @@ ENTRY main { // ----- -// CHECK-LABEL: define void @scatter_ScatterIntoScalar(i8* align 64 dereferenceable(4) %alloc0, i8* align 16 dereferenceable(4) %alloc1, i8* align 16 dereferenceable(4) %alloc2, i8* align 16 %alloc3) { +// CHECK-LABEL: define void @scatter_ScatterIntoScalar(i8* noalias align 64 dereferenceable(4) %alloc0, i8* noalias align 16 dereferenceable(4) %alloc1, i8* noalias align 16 dereferenceable(4) %alloc2, i8* noalias align 16 %alloc3) { // CHECK: entry: // CHECK: %[[VAL_60:.*]] = alloca i32, align 4 // CHECK: %[[VAL_37:.*]] = getelementptr inbounds i8, i8* %[[VAL_38:.*]], i64 0 @@ -101,7 +98,7 @@ ENTRY main { // CHECK: scatter.in_bounds-after: ; preds = %[[VAL_59]], 
%[[VAL_55]] // CHECK: br label %[[VAL_56]] // CHECK: scatter.in_bounds-true: ; preds = %[[VAL_55]] -// CHECK: %[[VAL_61:.*]] = load i32, i32* %[[VAL_48]], align 4, !invariant.load !3, !noalias !4 +// CHECK: %[[VAL_61:.*]] = load i32, i32* %[[VAL_48]], align 4, !invariant.load !3 // CHECK: store i32 %[[VAL_61]], i32* %[[VAL_60]], align 4 // CHECK: %[[VAL_62:.*]] = load i32, i32* %[[VAL_60]], align 4 // CHECK: store atomic i32 %[[VAL_62]], i32* %[[VAL_39]] unordered, align 4 @@ -111,9 +108,6 @@ ENTRY main { // CHECK: !1 = !{void (i8*, i8*, i8*, i8*)* @scatter_ScatterIntoScalar, !"reqntidx", i32 1} // CHECK: !2 = !{i32 0, i32 1} // CHECK: !3 = !{} -// CHECK: !4 = !{!5} -// CHECK: !5 = !{!"buffer: {index:0, offset:0, size:4}", !6} -// CHECK: !6 = !{!"XLA global AA domain"} HloModule ScatterIntoScalar @@ -137,7 +131,7 @@ ENTRY main { // ----- -// CHECK-LABEL: define void @scatter_TensorFlowScatter_Mul(i8* align 64 dereferenceable(36) %alloc0, i8* align 16 dereferenceable(36) %alloc1, i8* align 16 dereferenceable(24) %alloc2, i8* align 16 dereferenceable(8) %alloc3) { +// CHECK-LABEL: define void @scatter_TensorFlowScatter_Mul(i8* noalias align 64 dereferenceable(36) %alloc0, i8* noalias align 16 dereferenceable(36) %alloc1, i8* noalias align 16 dereferenceable(24) %alloc2, i8* noalias align 16 dereferenceable(8) %alloc3) { // CHECK: %[[VAL_63:.*]] = alloca i32, align 4 // CHECK: %[[VAL_64:.*]] = alloca i32, align 4 // CHECK: %[[VAL_98:.*]] = alloca i32, align 4 @@ -164,7 +158,7 @@ ENTRY main { // CHECK: ret void // CHECK: scatter_TensorFlowScatter_Mul.in_bounds-true: ; preds = %[[VAL_89]] // CHECK: %[[VAL_90:.*]] = getelementptr inbounds [2 x i32], [2 x i32]* %[[VAL_73]], i32 0, i32 %[[VAL_84]] -// CHECK: %[[VAL_91:.*]] = load i32, i32* %[[VAL_90]], align 4, !invariant.load !4, !noalias !5 +// CHECK: %[[VAL_91:.*]] = load i32, i32* %[[VAL_90]], align 4, !invariant.load !4 // CHECK: %[[VAL_92:.*]] = add i32 0, %[[VAL_91]] // CHECK: %[[VAL_93:.*]] = icmp ult i32 %[[VAL_91]], 3 // CHECK: %[[VAL_94:.*]] = and i1 true, %[[VAL_93]] @@ -175,7 +169,7 @@ ENTRY main { // CHECK: %[[VAL_97:.*]] = getelementptr inbounds [3 x [3 x i32]], [3 x [3 x i32]]* %[[VAL_67]], i32 0, i32 %[[VAL_92]], i32 %[[VAL_83]] // CHECK: %[[VAL_99:.*]] = bitcast [2 x [3 x i32]]* %[[VAL_76]] to i32* // CHECK: %[[VAL_100:.*]] = getelementptr inbounds i32, i32* %[[VAL_99]], i32 %[[VAL_80]] -// CHECK: %[[VAL_101:.*]] = load i32, i32* %[[VAL_100]], align 4, !invariant.load !4, !noalias !5 +// CHECK: %[[VAL_101:.*]] = load i32, i32* %[[VAL_100]], align 4, !invariant.load !4 // CHECK: store i32 %[[VAL_101]], i32* %[[VAL_98]], align 4 // CHECK: %[[VAL_102:.*]] = load i32, i32* %[[VAL_98]], align 4 // CHECK: %[[VAL_103:.*]] = load i32, i32* %[[VAL_97]], align 4 @@ -186,7 +180,7 @@ ENTRY main { // CHECK: atomic_op_loop_body: ; preds = %[[VAL_104]], %[[VAL_95]] // CHECK: %[[VAL_105:.*]] = load i32, i32* %[[VAL_64]], align 4 // CHECK: store i32 %[[VAL_105]], i32* %[[VAL_63]], align 4 -// CHECK: call void @mul_s32(i32* %[[VAL_63]], i32* %[[VAL_98]], i32* %[[VAL_63]], i8* null) +// CHECK: call void @mul_s32(i32* %[[VAL_63]], i32* %[[VAL_98]], i32* %[[VAL_63]]) // CHECK: %[[VAL_106:.*]] = load i32, i32* %[[VAL_63]], align 4 // CHECK: %[[VAL_107:.*]] = cmpxchg i32* %[[VAL_97]], i32 %[[VAL_105]], i32 %[[VAL_106]] seq_cst seq_cst // CHECK: %[[VAL_108:.*]] = extractvalue { i32, i1 } %[[VAL_107]], 0 @@ -199,15 +193,6 @@ ENTRY main { // CHECK: !2 = !{i32 0, i32 1} // CHECK: !3 = !{i32 0, i32 6} // CHECK: !4 = !{} -// CHECK: !5 = !{!6} -// CHECK: !6 = 
!{!"buffer: {index:0, offset:0, size:36}", !7} -// CHECK: !7 = !{!"XLA global AA domain"} -// CHECK: !8 = !{!9} -// CHECK: !9 = !{!"buffer: {index:4, offset:0, size:4}", !7} -// CHECK: !10 = !{!11} -// CHECK: !11 = !{!"buffer: {index:6, offset:0, size:4}", !7} -// CHECK: !12 = !{!13} -// CHECK: !13 = !{!"buffer: {index:5, offset:0, size:4}", !7} HloModule TensorFlowScatter_Mul @@ -231,7 +216,7 @@ ENTRY main { // ----- -// CHECK-LABEL: define void @scatter_ScalarUpdate(i8* align 64 dereferenceable(16) %alloc0, i8* align 16 dereferenceable(16) %alloc1, i8* align 16 dereferenceable(4) %alloc2, i8* align 16 dereferenceable(4) %alloc3) { +// CHECK-LABEL: define void @scatter_ScalarUpdate(i8* noalias align 64 dereferenceable(16) %alloc0, i8* noalias align 16 dereferenceable(16) %alloc1, i8* noalias align 16 dereferenceable(4) %alloc2, i8* noalias align 16 dereferenceable(4) %alloc3) { // CHECK: entry: // CHECK: %[[VAL_146:.*]] = alloca i32, align 4 // CHECK: %[[VAL_118:.*]] = getelementptr inbounds i8, i8* %[[VAL_119:.*]], i64 0 @@ -253,7 +238,7 @@ ENTRY main { // CHECK: scatter_ScalarUpdate.in_bounds-after: ; preds = %[[VAL_138:.*]], %[[VAL_139:.*]] // CHECK: ret void // CHECK: scatter_ScalarUpdate.in_bounds-true: ; preds = %[[VAL_139]] -// CHECK: %[[VAL_140:.*]] = load i32, i32* %[[VAL_126]], align 4, !invariant.load !3, !noalias !4 +// CHECK: %[[VAL_140:.*]] = load i32, i32* %[[VAL_126]], align 4, !invariant.load !3 // CHECK: %[[VAL_141:.*]] = add i32 0, %[[VAL_140]] // CHECK: %[[VAL_142:.*]] = icmp ult i32 %[[VAL_140]], 4 // CHECK: %[[VAL_143:.*]] = and i1 true, %[[VAL_142]] @@ -262,7 +247,7 @@ ENTRY main { // CHECK: br label %[[VAL_137]] // CHECK: scatter.in_bounds-true: ; preds = %[[VAL_136]] // CHECK: %[[VAL_145:.*]] = getelementptr inbounds [4 x i32], [4 x i32]* %[[VAL_120]], i32 0, i32 %[[VAL_141]] -// CHECK: %[[VAL_147:.*]] = load i32, i32* %[[VAL_129]], align 4, !invariant.load !3, !noalias !4 +// CHECK: %[[VAL_147:.*]] = load i32, i32* %[[VAL_129]], align 4, !invariant.load !3 // CHECK: store i32 %[[VAL_147]], i32* %[[VAL_146]], align 4 // CHECK: %[[VAL_148:.*]] = load i32, i32* %[[VAL_146]], align 4 // CHECK: store atomic i32 %[[VAL_148]], i32* %[[VAL_145]] unordered, align 4 @@ -272,9 +257,6 @@ ENTRY main { // CHECK: !1 = !{void (i8*, i8*, i8*, i8*)* @scatter_ScalarUpdate, !"reqntidx", i32 1} // CHECK: !2 = !{i32 0, i32 1} // CHECK: !3 = !{} -// CHECK: !4 = !{!5} -// CHECK: !5 = !{!"buffer: {index:0, offset:0, size:16}", !6} -// CHECK: !6 = !{!"XLA global AA domain"} HloModule ScalarUpdate diff --git a/tensorflow/compiler/xla/service/gpu/tests/sorting.hlo b/tensorflow/compiler/xla/service/gpu/tests/sorting.hlo new file mode 100644 index 00000000000..272c9a25769 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/tests/sorting.hlo @@ -0,0 +1,394 @@ +// RUN: hlo_to_llvm_ir %s | FileCheck %s + +HloModule TestModule + +compare { + p.0.lhs = f32[] parameter(0) + p.0.rhs = f32[] parameter(1) + ROOT lt = pred[] compare(p.0.lhs, p.0.rhs), direction=LT +} + +// CHECK: define void @sort(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]], i8* noalias align 16 dereferenceable(24) [[ALLOC1:%.*]]) +// CHECK-NEXT: entry: +// CHECK-NEXT: [[COMPARE_RETURN_BUFFER:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[SORT_RAW:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0]], i64 0 +// CHECK-NEXT: [[SORT_TYPED:%.*]] = bitcast i8* [[SORT_RAW]] to [2 x [3 x float]]* +// CHECK-NEXT: [[X_RAW:%.*]] = getelementptr inbounds i8, i8* [[ALLOC1]], i64 0 +// CHECK-NEXT: [[X_TYPED:%.*]] = bitcast i8* [[X_RAW]] 
to [2 x [3 x float]]* +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 +// CHECK-NEXT: [[BLOCK_ID:%.*]] = zext i32 [[TMP0]] to i64 +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 +// CHECK-NEXT: [[THREAD_ID:%.*]] = zext i32 [[TMP1]] to i64 +// CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4 +// CHECK-NEXT: [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP2]], [[THREAD_ID]] +// CHECK-NEXT: [[LINEAR_INDEX_IN_RANGE:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 +// CHECK-NEXT: call void @llvm.assume(i1 [[LINEAR_INDEX_IN_RANGE]]) +// CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[LINEAR_INDEX]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = urem i64 [[TMP3]], 2 +// CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[LINEAR_INDEX]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 +// CHECK-NEXT: br i1 [[TMP6]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]] +// CHECK: sort.in_bounds-after: +// CHECK-NEXT: ret void +// CHECK: sort.in_bounds-true: +// CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 1 +// CHECK-NEXT: [[TMP9:%.*]] = icmp slt i64 [[TMP7]], [[TMP8]] +// CHECK-NEXT: [[TMP10:%.*]] = icmp slt i64 [[TMP8]], 3 +// CHECK-NEXT: [[TMP11:%.*]] = and i1 [[TMP9]], [[TMP10]] +// CHECK-NEXT: br i1 [[TMP11]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]] +// CHECK: smaller_comparison_index-after: +// CHECK-NEXT: br label [[SORT_IN_BOUNDS_AFTER]] +// CHECK: smaller_comparison_index-true: +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED]], i64 0, i64 [[TMP5]], i64 [[TMP8]] +// CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED]], i64 0, i64 [[TMP5]], i64 [[TMP7]] +// CHECK-NEXT: call void @compare(float* [[TMP12]], float* [[TMP13]], i8* [[COMPARE_RETURN_BUFFER]]) +// CHECK-NEXT: [[TMP14:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1 +// CHECK-NEXT: [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP14]], 0 +// CHECK-NEXT: br i1 [[BOOLEAN_PREDICATE]], label [[IS_SMALLER_THAN_TRUE:%.*]], label [[IS_SMALLER_THAN_AFTER:%.*]] +// CHECK: is_smaller_than-after: +// CHECK-NEXT: br label [[SMALLER_COMPARISON_INDEX_AFTER]] +// CHECK: is_smaller_than-true: +// CHECK-NEXT: [[TMP15:%.*]] = load float, float* [[TMP12]], align 4 +// CHECK-NEXT: [[TMP16:%.*]] = load float, float* [[TMP13]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED]], i64 0, i64 [[TMP5]], i64 [[TMP7]] +// CHECK-NEXT: store float [[TMP15]], float* [[TMP17]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED]], i64 0, i64 [[TMP5]], i64 [[TMP8]] +// CHECK-NEXT: store float [[TMP16]], float* [[TMP18]], align 4 +// CHECK-NEXT: br label [[IS_SMALLER_THAN_AFTER]] + +// CHECK: define internal void @compare(float* dereferenceable(4) [[P_0_LHS_TYPED:%.*]], float* dereferenceable(4) [[P_0_RHS_TYPED:%.*]], i8* dereferenceable(1) [[OUTPUT_ARG:%.*]]) +// CHECK-NEXT: entry: +// CHECK-NEXT: [[LT_TYPED:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[P_0_LHS_TYPED]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[P_0_RHS_TYPED]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = fcmp olt float [[TMP0]], [[TMP1]] +// CHECK-NEXT: [[TMP3:%.*]] = zext i1 [[TMP2]] to i8 +// CHECK-NEXT: store i8 [[TMP3]], i8* [[LT_TYPED]], align 1 
+// CHECK-NEXT: [[LOAD_RET_VALUE:%.*]] = load i8, i8* [[LT_TYPED]], align 1 +// CHECK-NEXT: store i8 [[LOAD_RET_VALUE]], i8* [[OUTPUT_ARG]], align 1 +// CHECK-NEXT: ret void + +// CHECK: define void @sort__1(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]], i8* noalias align 16 dereferenceable(24) [[ALLOC1:%.*]]) { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[COMPARE_RETURN_BUFFER:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[SORT_RAW:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0]], i64 0 +// CHECK-NEXT: [[SORT_TYPED:%.*]] = bitcast i8* [[SORT_RAW]] to [2 x [3 x float]]* +// CHECK-NEXT: [[X_RAW:%.*]] = getelementptr inbounds i8, i8* [[ALLOC1]], i64 0 +// CHECK-NEXT: [[X_TYPED:%.*]] = bitcast i8* [[X_RAW]] to [2 x [3 x float]]* +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 +// CHECK-NEXT: [[BLOCK_ID:%.*]] = zext i32 [[TMP0]] to i64 +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 +// CHECK-NEXT: [[THREAD_ID:%.*]] = zext i32 [[TMP1]] to i64 +// CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4 +// CHECK-NEXT: [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP2]], [[THREAD_ID]] +// CHECK-NEXT: [[LINEAR_INDEX_IN_RANGE:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 +// CHECK-NEXT: call void @llvm.assume(i1 [[LINEAR_INDEX_IN_RANGE]]) +// CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[LINEAR_INDEX]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = urem i64 [[TMP3]], 2 +// CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[LINEAR_INDEX]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 +// CHECK-NEXT: br i1 [[TMP6]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]] +// CHECK: sort.in_bounds-after: +// CHECK-NEXT: ret void +// CHECK: sort.in_bounds-true: +// CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP4]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = icmp slt i64 [[TMP4]], [[TMP7]] +// CHECK-NEXT: [[TMP9:%.*]] = icmp slt i64 [[TMP7]], 3 +// CHECK-NEXT: [[TMP10:%.*]] = and i1 [[TMP8]], [[TMP9]] +// CHECK-NEXT: br i1 [[TMP10]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]] +// CHECK: smaller_comparison_index-after: +// CHECK-NEXT: br label [[SORT_IN_BOUNDS_AFTER]] +// CHECK: smaller_comparison_index-true: +// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED]], i64 0, i64 [[TMP5]], i64 [[TMP7]] +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED]], i64 0, i64 [[TMP5]], i64 [[TMP4]] +// CHECK-NEXT: call void @compare(float* [[TMP11]], float* [[TMP12]], i8* [[COMPARE_RETURN_BUFFER]]) +// CHECK-NEXT: [[TMP13:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1 +// CHECK-NEXT: [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP13]], 0 +// CHECK-NEXT: br i1 [[BOOLEAN_PREDICATE]], label [[IS_SMALLER_THAN_TRUE:%.*]], label [[IS_SMALLER_THAN_AFTER:%.*]] +// CHECK: is_smaller_than-after: +// CHECK-NEXT: br label [[SMALLER_COMPARISON_INDEX_AFTER]] +// CHECK: is_smaller_than-true: +// CHECK-NEXT: [[TMP14:%.*]] = load float, float* [[TMP11]], align 4 +// CHECK-NEXT: [[TMP15:%.*]] = load float, float* [[TMP12]], align 4 +// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED]], i64 0, i64 [[TMP5]], i64 [[TMP4]] +// CHECK-NEXT: store float [[TMP14]], float* [[TMP16]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED]], i64 0, i64 [[TMP5]], i64 [[TMP7]] +// CHECK-NEXT: store float [[TMP15]], float* 
[[TMP17]], align 4 +// CHECK-NEXT: br label [[IS_SMALLER_THAN_AFTER]] + +// CHECK: define void @sort__2(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]], i8* noalias align 16 dereferenceable(24) [[ALLOC1:%.*]]) { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[COMPARE_RETURN_BUFFER:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[SORT_RAW:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0:%.*]], i64 0 +// CHECK-NEXT: [[SORT_TYPED:%.*]] = bitcast i8* [[SORT_RAW]] to [2 x [3 x float]]* +// CHECK-NEXT: [[X_RAW:%.*]] = getelementptr inbounds i8, i8* [[ALLOC1:%.*]], i64 0 +// CHECK-NEXT: [[X_TYPED:%.*]] = bitcast i8* [[X_RAW]] to [2 x [3 x float]]* +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 +// CHECK-NEXT: [[BLOCK_ID:%.*]] = zext i32 [[TMP0]] to i64 +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 +// CHECK-NEXT: [[THREAD_ID:%.*]] = zext i32 [[TMP1]] to i64 +// CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4 +// CHECK-NEXT: [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP2]], [[THREAD_ID]] +// CHECK-NEXT: [[LINEAR_INDEX_IN_RANGE:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 +// CHECK-NEXT: call void @llvm.assume(i1 [[LINEAR_INDEX_IN_RANGE]]) +// CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[LINEAR_INDEX]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = urem i64 [[TMP3]], 2 +// CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[LINEAR_INDEX]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 +// CHECK-NEXT: br i1 [[TMP6]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]] +// CHECK: sort.in_bounds-after: +// CHECK-NEXT: ret void +// CHECK: sort.in_bounds-true: +// CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 1 +// CHECK-NEXT: [[TMP9:%.*]] = icmp slt i64 [[TMP7]], [[TMP8]] +// CHECK-NEXT: [[TMP10:%.*]] = icmp slt i64 [[TMP8]], 3 +// CHECK-NEXT: [[TMP11:%.*]] = and i1 [[TMP9]], [[TMP10]] +// CHECK-NEXT: br i1 [[TMP11]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]] +// CHECK: smaller_comparison_index-after: +// CHECK-NEXT: br label [[SORT_IN_BOUNDS_AFTER]] +// CHECK: smaller_comparison_index-true: +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED]], i64 0, i64 [[TMP5]], i64 [[TMP8]] +// CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED]], i64 0, i64 [[TMP5]], i64 [[TMP7]] +// CHECK-NEXT: call void @compare(float* [[TMP12]], float* [[TMP13]], i8* [[COMPARE_RETURN_BUFFER]]) +// CHECK-NEXT: [[TMP14:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1 +// CHECK-NEXT: [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP14]], 0 +// CHECK-NEXT: br i1 [[BOOLEAN_PREDICATE]], label [[IS_SMALLER_THAN_TRUE:%.*]], label [[IS_SMALLER_THAN_AFTER:%.*]] +// CHECK: is_smaller_than-after: +// CHECK-NEXT: br label [[SMALLER_COMPARISON_INDEX_AFTER]] +// CHECK: is_smaller_than-true: +// CHECK-NEXT: [[TMP15:%.*]] = load float, float* [[TMP12]], align 4 +// CHECK-NEXT: [[TMP16:%.*]] = load float, float* [[TMP13]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED]], i64 0, i64 [[TMP5]], i64 [[TMP7]] +// CHECK-NEXT: store float [[TMP15]], float* [[TMP17]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED]], i64 0, i64 [[TMP5]], i64 [[TMP8]] +// CHECK-NEXT: store float [[TMP16]], float* [[TMP18]], align 4 +// CHECK-NEXT: br 
label [[IS_SMALLER_THAN_AFTER]] +ENTRY main { + x = f32[2, 3] parameter(0) + ROOT sort = f32[2, 3] sort(x), dimensions={1}, to_apply=compare +} + +// ----- + +HloModule TestModule + +compare { + p.0.lhs = s32[] parameter(0) + p.0.rhs = s32[] parameter(1) + p.1.lhs = f32[] parameter(2) + p.1.rhs = f32[] parameter(3) + ROOT lt = pred[] compare(p.1.lhs, p.1.rhs), direction=LT +} + +// CHECK: define void @sort(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]], i8* noalias align 64 dereferenceable(24) [[ALLOC1:%.*]], i8* noalias align 16 dereferenceable(24) [[ALLOC2:%.*]], i8* noalias align 16 dereferenceable(24) [[ALLOC3:%.*]], i8* noalias align 64 dereferenceable(16) [[ALLOC4:%.*]]) +// CHECK-NEXT: entry: +// CHECK-NEXT: [[COMPARE_RETURN_BUFFER:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[SORT_RAW:%.*]] = getelementptr inbounds i8, i8* [[ALLOC4]], i64 0 +// CHECK-NEXT: [[SORT_TYPED:%.*]] = bitcast i8* [[SORT_RAW]] to [2 x i8*]* +// CHECK-NEXT: [[SORT_RAW1:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0]], i64 0 +// CHECK-NEXT: [[SORT_TYPED2:%.*]] = bitcast i8* [[SORT_RAW1]] to [2 x [3 x i32]]* +// CHECK-NEXT: [[SORT_RAW3:%.*]] = getelementptr inbounds i8, i8* [[ALLOC1]], i64 0 +// CHECK-NEXT: [[SORT_TYPED4:%.*]] = bitcast i8* [[SORT_RAW3]] to [2 x [3 x float]]* +// CHECK-NEXT: [[X_RAW:%.*]] = getelementptr inbounds i8, i8* [[ALLOC2]], i64 0 +// CHECK-NEXT: [[X_TYPED:%.*]] = bitcast i8* [[X_RAW]] to [2 x [3 x i32]]* +// CHECK-NEXT: [[Y_RAW:%.*]] = getelementptr inbounds i8, i8* [[ALLOC3]], i64 0 +// CHECK-NEXT: [[Y_TYPED:%.*]] = bitcast i8* [[Y_RAW]] to [2 x [3 x float]]* +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 +// CHECK-NEXT: [[BLOCK_ID:%.*]] = zext i32 [[TMP0]] to i64 +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 +// CHECK-NEXT: [[THREAD_ID:%.*]] = zext i32 [[TMP1]] to i64 +// CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4 +// CHECK-NEXT: [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP2]], [[THREAD_ID]] +// CHECK-NEXT: [[LINEAR_INDEX_IN_RANGE:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 +// CHECK-NEXT: call void @llvm.assume(i1 [[LINEAR_INDEX_IN_RANGE]]) +// CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[LINEAR_INDEX]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = urem i64 [[TMP3]], 2 +// CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[LINEAR_INDEX]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 +// CHECK-NEXT: br i1 [[TMP6]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]] +// CHECK: sort.in_bounds-after: +// CHECK-NEXT: ret void +// CHECK: sort.in_bounds-true: +// CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 1 +// CHECK-NEXT: [[TMP9:%.*]] = icmp slt i64 [[TMP7]], [[TMP8]] +// CHECK-NEXT: [[TMP10:%.*]] = icmp slt i64 [[TMP8]], 3 +// CHECK-NEXT: [[TMP11:%.*]] = and i1 [[TMP9]], [[TMP10]] +// CHECK-NEXT: br i1 [[TMP11]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]] +// CHECK: smaller_comparison_index-after: +// CHECK-NEXT: br label [[SORT_IN_BOUNDS_AFTER]] +// CHECK: smaller_comparison_index-true: +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[SORT_TYPED2]], i64 0, i64 [[TMP5]], i64 [[TMP8]] +// CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[SORT_TYPED2]], i64 0, i64 [[TMP5]], i64 [[TMP7]] +// CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED4]], i64 0, i64 
[[TMP5]], i64 [[TMP8]] +// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED4]], i64 0, i64 [[TMP5]], i64 [[TMP7]] +// CHECK-NEXT: call void @compare(i32* [[TMP12]], i32* [[TMP13]], float* [[TMP14]], float* [[TMP15]], i8* [[COMPARE_RETURN_BUFFER]]) +// CHECK-NEXT: [[TMP16:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1 +// CHECK-NEXT: [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP16]], 0 +// CHECK-NEXT: br i1 [[BOOLEAN_PREDICATE]], label [[IS_SMALLER_THAN_TRUE:%.*]], label [[IS_SMALLER_THAN_AFTER:%.*]] +// CHECK: is_smaller_than-after: +// CHECK-NEXT: br label [[SMALLER_COMPARISON_INDEX_AFTER]] +// CHECK: is_smaller_than-true: +// CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP13]], align 4 +// CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[SORT_TYPED2]], i64 0, i64 [[TMP5]], i64 [[TMP7]] +// CHECK-NEXT: store i32 [[TMP17]], i32* [[TMP19]], align 4 +// CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[SORT_TYPED2]], i64 0, i64 [[TMP5]], i64 [[TMP8]] +// CHECK-NEXT: store i32 [[TMP18]], i32* [[TMP20]], align 4 +// CHECK-NEXT: [[TMP21:%.*]] = load float, float* [[TMP14]], align 4 +// CHECK-NEXT: [[TMP22:%.*]] = load float, float* [[TMP15]], align 4 +// CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED4]], i64 0, i64 [[TMP5]], i64 [[TMP7]] +// CHECK-NEXT: store float [[TMP21]], float* [[TMP23]], align 4 +// CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED4]], i64 0, i64 [[TMP5]], i64 [[TMP8]] +// CHECK-NEXT: store float [[TMP22]], float* [[TMP24]], align 4 +// CHECK-NEXT: br label [[IS_SMALLER_THAN_AFTER]] + +// CHECK: define internal void @compare(i32* dereferenceable(4) [[P_0_LHS_TYPED:%.*]], i32* dereferenceable(4) [[P_0_RHS_TYPED:%.*]], float* dereferenceable(4) [[P_1_LHS_TYPED:%.*]], float* dereferenceable(4) [[P_1_RHS_TYPED:%.*]], i8* dereferenceable(1) [[OUTPUT_ARG:%.*]]) +// CHECK-NEXT: entry: +// CHECK-NEXT: [[LT_TYPED:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[P_1_LHS_TYPED]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[P_1_RHS_TYPED]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = fcmp olt float [[TMP0]], [[TMP1]] +// CHECK-NEXT: [[TMP3:%.*]] = zext i1 [[TMP2]] to i8 +// CHECK-NEXT: store i8 [[TMP3]], i8* [[LT_TYPED]], align 1 +// CHECK-NEXT: [[LOAD_RET_VALUE:%.*]] = load i8, i8* [[LT_TYPED]], align 1 +// CHECK-NEXT: store i8 [[LOAD_RET_VALUE]], i8* [[OUTPUT_ARG]], align 1 +// CHECK-NEXT: ret void + +// CHECK: define void @sort__1(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]], i8* noalias align 64 dereferenceable(24) [[ALLOC1:%.*]], i8* noalias align 16 dereferenceable(24) [[ALLOC2:%.*]], i8* noalias align 16 dereferenceable(24) [[ALLOC3:%.*]], i8* noalias align 64 dereferenceable(16) [[ALLOC4:%.*]]) +// CHECK-NEXT: entry: +// CHECK-NEXT: [[COMPARE_RETURN_BUFFER:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[SORT_RAW:%.*]] = getelementptr inbounds i8, i8* [[ALLOC4:%.*]], i64 0 +// CHECK-NEXT: [[SORT_TYPED:%.*]] = bitcast i8* [[SORT_RAW]] to [2 x i8*]* +// CHECK-NEXT: [[SORT_RAW1:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0:%.*]], i64 0 +// CHECK-NEXT: [[SORT_TYPED2:%.*]] = bitcast i8* [[SORT_RAW1]] to [2 x [3 x i32]]* +// CHECK-NEXT: [[SORT_RAW3:%.*]] = getelementptr inbounds i8, i8* [[ALLOC1:%.*]], i64 0 +// CHECK-NEXT: 
[[SORT_TYPED4:%.*]] = bitcast i8* [[SORT_RAW3]] to [2 x [3 x float]]* +// CHECK-NEXT: [[X_RAW:%.*]] = getelementptr inbounds i8, i8* [[ALLOC2:%.*]], i64 0 +// CHECK-NEXT: [[X_TYPED:%.*]] = bitcast i8* [[X_RAW]] to [2 x [3 x i32]]* +// CHECK-NEXT: [[Y_RAW:%.*]] = getelementptr inbounds i8, i8* [[ALLOC3:%.*]], i64 0 +// CHECK-NEXT: [[Y_TYPED:%.*]] = bitcast i8* [[Y_RAW]] to [2 x [3 x float]]* +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 +// CHECK-NEXT: [[BLOCK_ID:%.*]] = zext i32 [[TMP0]] to i64 +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 +// CHECK-NEXT: [[THREAD_ID:%.*]] = zext i32 [[TMP1]] to i64 +// CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4 +// CHECK-NEXT: [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP2]], [[THREAD_ID]] +// CHECK-NEXT: [[LINEAR_INDEX_IN_RANGE:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 +// CHECK-NEXT: call void @llvm.assume(i1 [[LINEAR_INDEX_IN_RANGE]]) +// CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[LINEAR_INDEX]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = urem i64 [[TMP3]], 2 +// CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[LINEAR_INDEX]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 +// CHECK-NEXT: br i1 [[TMP6]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]] +// CHECK: sort.in_bounds-after: +// CHECK-NEXT: ret void +// CHECK: sort.in_bounds-true: +// CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP4]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = icmp slt i64 [[TMP4]], [[TMP7]] +// CHECK-NEXT: [[TMP9:%.*]] = icmp slt i64 [[TMP7]], 3 +// CHECK-NEXT: [[TMP10:%.*]] = and i1 [[TMP8]], [[TMP9]] +// CHECK-NEXT: br i1 [[TMP10]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]] +// CHECK: smaller_comparison_index-after: +// CHECK-NEXT: br label [[SORT_IN_BOUNDS_AFTER]] +// CHECK: smaller_comparison_index-true: +// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[SORT_TYPED2]], i64 0, i64 [[TMP5]], i64 [[TMP7]] +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[SORT_TYPED2]], i64 0, i64 [[TMP5]], i64 [[TMP4]] +// CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED4]], i64 0, i64 [[TMP5]], i64 [[TMP7]] +// CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED4]], i64 0, i64 [[TMP5]], i64 [[TMP4]] +// CHECK-NEXT: call void @compare(i32* [[TMP11]], i32* [[TMP12]], float* [[TMP13]], float* [[TMP14]], i8* [[COMPARE_RETURN_BUFFER]]) +// CHECK-NEXT: [[TMP15:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1 +// CHECK-NEXT: [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP15]], 0 +// CHECK-NEXT: br i1 [[BOOLEAN_PREDICATE]], label [[IS_SMALLER_THAN_TRUE:%.*]], label [[IS_SMALLER_THAN_AFTER:%.*]] +// CHECK: is_smaller_than-after: +// CHECK-NEXT: br label [[SMALLER_COMPARISON_INDEX_AFTER]] +// CHECK: is_smaller_than-true: +// CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP11]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[SORT_TYPED2]], i64 0, i64 [[TMP5]], i64 [[TMP4]] +// CHECK-NEXT: store i32 [[TMP16]], i32* [[TMP18]], align 4 +// CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[SORT_TYPED2]], i64 0, i64 [[TMP5]], i64 [[TMP7]] +// CHECK-NEXT: store i32 [[TMP17]], i32* [[TMP19]], align 4 +// CHECK-NEXT: [[TMP20:%.*]] = 
load float, float* [[TMP13]], align 4 +// CHECK-NEXT: [[TMP21:%.*]] = load float, float* [[TMP14]], align 4 +// CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED4]], i64 0, i64 [[TMP5]], i64 [[TMP4]] +// CHECK-NEXT: store float [[TMP20]], float* [[TMP22]], align 4 +// CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED4]], i64 0, i64 [[TMP5]], i64 [[TMP7]] +// CHECK-NEXT: store float [[TMP21]], float* [[TMP23]], align 4 +// CHECK-NEXT: br label [[IS_SMALLER_THAN_AFTER]] + +// CHECK: define void @sort__2(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]], i8* noalias align 64 dereferenceable(24) [[ALLOC1:%.*]], i8* noalias align 16 dereferenceable(24) [[ALLOC2:%.*]], i8* noalias align 16 dereferenceable(24) [[ALLOC3:%.*]], i8* noalias align 64 dereferenceable(16) [[ALLOC4:%.*]]) +// CHECK-NEXT: entry: +// CHECK-NEXT: [[COMPARE_RETURN_BUFFER:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[SORT_RAW:%.*]] = getelementptr inbounds i8, i8* [[ALLOC4:%.*]], i64 0 +// CHECK-NEXT: [[SORT_TYPED:%.*]] = bitcast i8* [[SORT_RAW]] to [2 x i8*]* +// CHECK-NEXT: [[SORT_RAW1:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0:%.*]], i64 0 +// CHECK-NEXT: [[SORT_TYPED2:%.*]] = bitcast i8* [[SORT_RAW1]] to [2 x [3 x i32]]* +// CHECK-NEXT: [[SORT_RAW3:%.*]] = getelementptr inbounds i8, i8* [[ALLOC1:%.*]], i64 0 +// CHECK-NEXT: [[SORT_TYPED4:%.*]] = bitcast i8* [[SORT_RAW3]] to [2 x [3 x float]]* +// CHECK-NEXT: [[X_RAW:%.*]] = getelementptr inbounds i8, i8* [[ALLOC2:%.*]], i64 0 +// CHECK-NEXT: [[X_TYPED:%.*]] = bitcast i8* [[X_RAW]] to [2 x [3 x i32]]* +// CHECK-NEXT: [[Y_RAW:%.*]] = getelementptr inbounds i8, i8* [[ALLOC3:%.*]], i64 0 +// CHECK-NEXT: [[Y_TYPED:%.*]] = bitcast i8* [[Y_RAW]] to [2 x [3 x float]]* +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 +// CHECK-NEXT: [[BLOCK_ID:%.*]] = zext i32 [[TMP0]] to i64 +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 +// CHECK-NEXT: [[THREAD_ID:%.*]] = zext i32 [[TMP1]] to i64 +// CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4 +// CHECK-NEXT: [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP2]], [[THREAD_ID]] +// CHECK-NEXT: [[LINEAR_INDEX_IN_RANGE:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 +// CHECK-NEXT: call void @llvm.assume(i1 [[LINEAR_INDEX_IN_RANGE]]) +// CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[LINEAR_INDEX]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = urem i64 [[TMP3]], 2 +// CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[LINEAR_INDEX]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 +// CHECK-NEXT: br i1 [[TMP6]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]] +// CHECK: sort.in_bounds-after: +// CHECK-NEXT: [[TMP7:%.*]] = bitcast [2 x [3 x i32]]* [[SORT_TYPED2]] to i8* +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[SORT_TYPED]], i64 0, i64 0 +// CHECK-NEXT: store i8* [[TMP7]], i8** [[TMP8]], align 8 +// CHECK-NEXT: [[TMP9:%.*]] = bitcast [2 x [3 x float]]* [[SORT_TYPED4]] to i8* +// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[SORT_TYPED]], i64 0, i64 1 +// CHECK-NEXT: store i8* [[TMP9]], i8** [[TMP10]], align 8 +// CHECK-NEXT: ret void +// CHECK: sort.in_bounds-true: +// CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP4]], 2 +// CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 1 +// CHECK-NEXT: [[TMP13:%.*]] = icmp slt i64 [[TMP11]], [[TMP12]] +// CHECK-NEXT: [[TMP14:%.*]] = icmp slt i64 [[TMP12]], 3 
+// CHECK-NEXT: [[TMP15:%.*]] = and i1 [[TMP13]], [[TMP14]] +// CHECK-NEXT: br i1 [[TMP15]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]] +// CHECK: smaller_comparison_index-after: +// CHECK-NEXT: br label [[SORT_IN_BOUNDS_AFTER]] +// CHECK: smaller_comparison_index-true: +// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[SORT_TYPED2]], i64 0, i64 [[TMP5]], i64 [[TMP12]] +// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[SORT_TYPED2]], i64 0, i64 [[TMP5]], i64 [[TMP11]] +// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED4]], i64 0, i64 [[TMP5]], i64 [[TMP12]] +// CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED4]], i64 0, i64 [[TMP5]], i64 [[TMP11]] +// CHECK-NEXT: call void @compare(i32* [[TMP16]], i32* [[TMP17]], float* [[TMP18]], float* [[TMP19]], i8* [[COMPARE_RETURN_BUFFER]]) +// CHECK-NEXT: [[TMP20:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1 +// CHECK-NEXT: [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP20]], 0 +// CHECK-NEXT: br i1 [[BOOLEAN_PREDICATE]], label [[IS_SMALLER_THAN_TRUE:%.*]], label [[IS_SMALLER_THAN_AFTER:%.*]] +// CHECK: is_smaller_than-after: +// CHECK-NEXT: br label [[SMALLER_COMPARISON_INDEX_AFTER]] +// CHECK: is_smaller_than-true: +// CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[TMP16]], align 4 +// CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[SORT_TYPED2]], i64 0, i64 [[TMP5]], i64 [[TMP11]] +// CHECK-NEXT: store i32 [[TMP21]], i32* [[TMP23]], align 4 +// CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[SORT_TYPED2]], i64 0, i64 [[TMP5]], i64 [[TMP12]] +// CHECK-NEXT: store i32 [[TMP22]], i32* [[TMP24]], align 4 +// CHECK-NEXT: [[TMP25:%.*]] = load float, float* [[TMP18]], align 4 +// CHECK-NEXT: [[TMP26:%.*]] = load float, float* [[TMP19]], align 4 +// CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED4]], i64 0, i64 [[TMP5]], i64 [[TMP11]] +// CHECK-NEXT: store float [[TMP25]], float* [[TMP27]], align 4 +// CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED4]], i64 0, i64 [[TMP5]], i64 [[TMP12]] +// CHECK-NEXT: store float [[TMP26]], float* [[TMP28]], align 4 +// CHECK-NEXT: br label [[IS_SMALLER_THAN_AFTER]] +ENTRY main { + x = s32[2, 3] parameter(0) + y = f32[2, 3] parameter(1) + ROOT sort = (s32[2, 3], f32[2, 3]) sort(x, y), dimensions={1}, to_apply=compare +} diff --git a/tensorflow/compiler/xla/service/gpu/thunk.h b/tensorflow/compiler/xla/service/gpu/thunk.h index 0a5382291c9..7a9fedec629 100644 --- a/tensorflow/compiler/xla/service/gpu/thunk.h +++ b/tensorflow/compiler/xla/service/gpu/thunk.h @@ -69,10 +69,12 @@ class Thunk { }; struct ThunkInfo { + // Optional. It's only used by subclasses which haven't been migrated away + // from HloInstructions. Once the migration is done, Thunks should be fully + // serializable. const HloInstruction* hlo_instruction = nullptr; absl::optional profile_index; - // TODO(timshen): Remove hlo_instruction and add name(), - // profile_annotation() here. 
+ std::string profile_annotation; }; // The hlo_instruction argument is meant to be the instruction this thunk was @@ -80,9 +82,8 @@ class Thunk { // to Thunk::hlo_instruction, so it can be null. explicit Thunk(Kind kind, ThunkInfo thunk_info) : kind_(kind), - hlo_instruction_(thunk_info.hlo_instruction), - name_(hlo_instruction_ ? hlo_instruction_->name() : ""), - profile_index_(thunk_info.profile_index) {} + profile_index_(thunk_info.profile_index), + profile_annotation_(thunk_info.profile_annotation) {} virtual ~Thunk() {} Thunk(const Thunk&) = delete; Thunk& operator=(const Thunk&) = delete; @@ -90,19 +91,6 @@ class Thunk { Kind kind() const { return kind_; } string profile_annotation() const { return profile_annotation_; } - absl::string_view name() const { return name_; } - - // Constructs and caches the profile annotation string for this thunk and - // any child thunks. - virtual void ComputeAnnotations() { - const HloInstruction* hlo = hlo_instruction(); - if (hlo) { - profile_annotation_ = - absl::StrFormat("Thunk:#hlo_op=%s,hlo_module=%s#", hlo->name(), - hlo->GetModule()->name()); - } - } - // Prepares the thunk for execution on the given StreamExecutor. // // This may be called multiple times. Its main purpose is to give us a chance @@ -134,14 +122,8 @@ class Thunk { virtual Status ExecuteOnStream(const ExecuteParams& params) = 0; protected: - const HloInstruction* hlo_instruction() const { return hlo_instruction_; } - absl::optional profile_index() const { return profile_index_; } - const HloModuleConfig& GetModuleConfig() const { - return hlo_instruction()->GetModule()->config(); - } - // Safely copies the given buffer to the GPU, deleting it on the host only // after the copy has completed. template @@ -156,13 +138,8 @@ class Thunk { private: Kind kind_; - - // Will be removed in the future, as Thunk is migrating away from the - // monolithic HloInstruction. - const HloInstruction* hlo_instruction_; - std::string name_; absl::optional profile_index_; - string profile_annotation_; + std::string profile_annotation_; }; // A sequence of thunks. diff --git a/tensorflow/compiler/xla/service/gpu/thunk_emitter.cc b/tensorflow/compiler/xla/service/gpu/thunk_emitter.cc index 089d70d658f..690d0c9de56 100644 --- a/tensorflow/compiler/xla/service/gpu/thunk_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/thunk_emitter.cc @@ -386,6 +386,8 @@ Thunk::ThunkInfo ThunkEmitter::EmissionContext::GetThunkInfo( CHECK(hlo); Thunk::ThunkInfo info; info.hlo_instruction = hlo; + info.profile_annotation = absl::StrFormat( + "Thunk:#hlo_op=%s,hlo_module=%s#", hlo->name(), hlo->GetModule()->name()); return info; } } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/thunk_schedule.h b/tensorflow/compiler/xla/service/gpu/thunk_schedule.h index 3801dc8aee8..ceae39583f2 100644 --- a/tensorflow/compiler/xla/service/gpu/thunk_schedule.h +++ b/tensorflow/compiler/xla/service/gpu/thunk_schedule.h @@ -80,8 +80,8 @@ class ThunkSchedule { // `thunk`. // // Precondition: `operand` is a non-trivial (i.e. excluding - // thunk.hlo_instruction() itself) transitive operand of - // thunk.hlo_instruction(). + // thunk.hlo_instruction_ itself) transitive operand of + // thunk.hlo_instruction_. 
void AddDependenciesOnTransitiveOperands( const Thunk& thunk, const HloInstruction& operand, const absl::flat_hash_map& hlo_to_thunk); diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.cc b/tensorflow/compiler/xla/service/gpu/while_thunk.cc index 47a24552b6c..792479df4ac 100644 --- a/tensorflow/compiler/xla/service/gpu/while_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/while_thunk.cc @@ -29,6 +29,7 @@ WhileThunk::WhileThunk( std::unique_ptr condition_thunk_sequence, std::unique_ptr body_thunk_sequence) : Thunk(Kind::kWhile, thunk_info), + hlo_instruction_(thunk_info.hlo_instruction), condition_result_buffer_index_(condition_result_buffer_index), // Pass nullptr as the HloInstruction* to the condition_thunk_sequence_ // and body_thunk_sequence_ constructors because these SequentialThunks @@ -39,12 +40,6 @@ WhileThunk::WhileThunk( body_thunk_sequence_(absl::make_unique( ThunkInfo(), std::move(*body_thunk_sequence))) {} -void WhileThunk::ComputeAnnotations() { - Thunk::ComputeAnnotations(); - condition_thunk_sequence_->ComputeAnnotations(); - body_thunk_sequence_->ComputeAnnotations(); -} - Status WhileThunk::Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) { TF_RETURN_IF_ERROR( @@ -67,7 +62,7 @@ Status WhileThunk::ExecuteOnStream(const ExecuteParams& params) { profiler.StartHloComputation(); VLOG(3) << "Executing condition computation"; TF_RETURN_IF_ERROR(condition_thunk_sequence_->ExecuteOnStream(params)); - profiler.FinishHloComputation(hlo_instruction()->while_condition()); + profiler.FinishHloComputation(hlo_instruction_->while_condition()); // Copy the result of condition computation and break the loop if 'false'. bool condition_result; @@ -91,7 +86,7 @@ Status WhileThunk::ExecuteOnStream(const ExecuteParams& params) { // Invoke thunk sequence for while 'body' computation, and pass on // 'profiler' to measure the timing of the thunks in 'body_thunk_sequence_'. TF_RETURN_IF_ERROR(body_thunk_sequence_->ExecuteOnStream(params)); - profiler.FinishHloComputation(hlo_instruction()->while_body()); + profiler.FinishHloComputation(hlo_instruction_->while_body()); } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.h b/tensorflow/compiler/xla/service/gpu/while_thunk.h index 72d9415b309..707bac15bb2 100644 --- a/tensorflow/compiler/xla/service/gpu/while_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/while_thunk.h @@ -46,12 +46,12 @@ class WhileThunk : public Thunk { WhileThunk(const WhileThunk&) = delete; WhileThunk& operator=(const WhileThunk&) = delete; - void ComputeAnnotations() override; Status Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) override; Status ExecuteOnStream(const ExecuteParams& params) override; private: + const HloInstruction* hlo_instruction_; const BufferAllocation::Slice condition_result_buffer_index_; std::unique_ptr condition_thunk_sequence_; std::unique_ptr body_thunk_sequence_; diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto index 960f60fe882..17a7b18c84b 100644 --- a/tensorflow/compiler/xla/service/hlo.proto +++ b/tensorflow/compiler/xla/service/hlo.proto @@ -35,7 +35,7 @@ import "tensorflow/compiler/xla/xla_data.proto"; option cc_enable_arenas = true; // Serialization of HloInstruction. -// Next ID: 72 +// Next ID: 73 message HloInstructionProto { reserved 10; reserved "parameter_name"; @@ -248,6 +248,9 @@ message HloInstructionProto { // RNG algorithm used by kRngBitGenerator. 
xla.RandomAlgorithm rng_algorithm = 70; + + // The comparison type used for kCompare. + string comparison_type = 72; } // Serialization of HloComputation. @@ -283,6 +286,16 @@ message HloScheduleProto { map sequences = 1; } +enum Kind { + // Define an UNDEFINED_ALIAS equal to zero to get around the default-0 proto3 + // behavior and missing has_*() APIs. + UNDEFINED_ALIAS = 0; + // The buffers may or may not alias at runtime. + MAY_ALIAS = 1; + // The buffers must alias at runtime. + MUST_ALIAS = 2; +} + message HloInputOutputAliasProto { // The following proto describes a pair of aliased buffers: an input // (described by parameter number and a ShapeIndex of the parameter) @@ -304,8 +317,8 @@ message HloInputOutputAliasProto { int64 parameter_number = 2; // ShapeIndex of the parameter instruction. repeated int64 parameter_shape_index = 3; - reserved 4; - reserved "kind"; + // The kind of alias to be set up. + Kind kind = 4; } repeated AliasEntryProto entries = 1; diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h index 022046209bf..d640007886c 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.h +++ b/tensorflow/compiler/xla/service/hlo_computation.h @@ -509,7 +509,7 @@ class HloComputation { enum VisitState { kVisiting, kVisited }; void ComputeInstructionPostOrder( - const HloComputation::ChannelDependencyGroup& channel_dependency_map, + const HloComputation::ChannelDependencyGroup& channel_dependency_group, std::vector* post_order, HloInstruction* root, absl::flat_hash_map* visited) const; diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.cc b/tensorflow/compiler/xla/service/hlo_creation_utils.cc index 0f5267e9fbc..4ba67888409 100644 --- a/tensorflow/compiler/xla/service/hlo_creation_utils.cc +++ b/tensorflow/compiler/xla/service/hlo_creation_utils.cc @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/lib/comparators.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/comparison_util.h" #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/hlo_clone_context.h" @@ -258,6 +259,11 @@ HloInstruction* MakeBitcastConvertToHlo(HloInstruction* hlo, PrimitiveType type) { CHECK_NE(hlo->shape().element_type(), type); Shape shape = ShapeUtil::ChangeElementType(hlo->shape(), type); + // PRED values are stored as one byte but have a BitWidth of 1; avoid this + // mismatch by using a convert instead of a bitcast convert. 
+ if (type == PRED || hlo->shape().element_type() == PRED) { + return MakeConvertToHlo(hlo, type); + } hlo = hlo->parent()->AddInstruction( HloInstruction::CreateBitcastConvert(shape, hlo)); CHECK_EQ(hlo->shape().element_type(), type); diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc index ae8f49df4b4..acccf7aac9a 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator.cc +++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc @@ -440,6 +440,10 @@ Status HloEvaluator::HandleSetDimensionSize( Literal result(set_dimension_size->shape()); memcpy(result.untyped_data(), operand_literal.untyped_data(), operand_literal.size_bytes()); + const Literal& size_literal = + GetEvaluatedLiteralFor(set_dimension_size->operand(1)); + result.SetDynamicSize(set_dimension_size->dimension(), + size_literal.Get({})); evaluated_[set_dimension_size] = std::move(result); return Status::OK(); } @@ -1569,9 +1573,9 @@ class OutputBatchIndexToInputIndex { int64 index_vector_dim = dim_numbers_.index_vector_dim(); for (int64 i = 0, e = index_vector_.size(); i < e; i++) { index_vector_index_[index_vector_dim] = i; - // TODO(george): OK what should happen here? - // seems OK to crash though. - index_vector_[i] = *start_indices_.GetIntegralAsS64(index_vector_index_); + auto start_index = start_indices_.GetIntegralAsS64(index_vector_index_); + TF_RET_CHECK(start_index.has_value()); + index_vector_[i] = *start_index; } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h index 1a154f32a6f..d5f0c62adc1 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h +++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h @@ -27,6 +27,7 @@ limitations under the License. #include "absl/meta/type_traits.h" #include "absl/types/optional.h" #include "tensorflow/compiler/xla/array2d.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/hlo_casting_utils.h" #include "tensorflow/compiler/xla/service/hlo_evaluator.h" @@ -47,22 +48,26 @@ template struct is_complex_t : absl::disjunction, std::is_same> {}; +namespace detail { +template +using unsigned_promoted_type_t = + std::make_unsigned_t() + std::declval())>; +} + // ToArithmeticSafeType(T t): -// - converts `t` to the bitwise-equivalent `unsigned T` if T is a signed +// - converts `t` to an unsigned integer at least as wide as `int` if T is an // integer, and // - otherwise returns `t` unchanged. // // It's UB in C++ to under/overflow a signed integer, so we wrap all arithmetic // in this type to force 2's complement behavior. 
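Editor's note (not part of the patch): the `detail::unsigned_promoted_type_t` helper introduced just above relies on ordinary C++ integer promotion — `std::declval<T>() + std::declval<T>()` has a type at least as wide as `int`, and its unsigned counterpart gives well-defined wrap-around arithmetic. A minimal standalone sketch of the same idea, using a hypothetical `unsigned_promoted_t` alias rather than the patch's helper:

```cpp
// Standalone sketch; mirrors the idea behind detail::unsigned_promoted_type_t.
#include <cstdint>
#include <type_traits>
#include <utility>

template <typename T>
using unsigned_promoted_t =
    std::make_unsigned_t<decltype(std::declval<T>() + std::declval<T>())>;

// int8 + int8 is performed in int, so the "safe" type is unsigned int,
// not uint8_t.
static_assert(std::is_same<unsigned_promoted_t<int8_t>, unsigned int>::value,
              "narrow types are widened past their own width");
// Types already at least as wide as int simply get their unsigned counterpart.
static_assert(std::is_same<unsigned_promoted_t<int64_t>, uint64_t>::value,
              "wide types keep their width");

int main() { return 0; }
```

Presumably the point of promoting, rather than using plain `std::make_unsigned_t<T>`, is that narrow unsigned types would otherwise be promoted back to signed `int` inside the arithmetic (e.g. a 16-bit unsigned multiply is carried out in `int` and can overflow), which is exactly the undefined behavior the wrapper is meant to avoid.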
template ::value && - std::is_signed::value>::type* = nullptr> -typename std::make_unsigned::type ToArithmeticSafeType(T t) { - return static_cast::type>(t); + typename std::enable_if::value>::type* = nullptr> +detail::unsigned_promoted_type_t ToArithmeticSafeType(T t) { + return static_cast>(t); } template ::value || - !std::is_signed::value>::type* = nullptr> + typename std::enable_if::value>::type* = nullptr> T ToArithmeticSafeType(T t) { return std::move(t); } @@ -1076,13 +1081,13 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { return Status::OK(); } - Status HandleConvolution(HloInstruction* conv) override { - auto lhs = conv->operand(0); - auto rhs = conv->operand(1); + Status HandleConvolutionWithLiterals(HloInstruction* conv, + const Literal& lhs_literal, + const Literal& rhs_literal) { const auto& window = conv->window(); const Shape& result_shape = conv->shape(); - const Shape& lhs_shape = lhs->shape(); - const Shape& rhs_shape = rhs->shape(); + const Shape& lhs_shape = lhs_literal.shape(); + const Shape& rhs_shape = rhs_literal.shape(); TF_CHECK_OK(ShapeUtil::ValidateShape(lhs_shape)); TF_CHECK_OK(ShapeUtil::ValidateShape(rhs_shape)); @@ -1098,24 +1103,6 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { CHECK_GE(num_spatial_dims, 0); CHECK_EQ(window.dimensions_size(), num_spatial_dims); - const auto lhs_rank = lhs_shape.rank(); - const auto rhs_rank = rhs_shape.rank(); - - CHECK_EQ(num_spatial_dims + 2, lhs_rank); - CHECK_EQ(num_spatial_dims + 2, rhs_rank); - - TF_ASSIGN_OR_RETURN(auto inferred_return_shape, - ShapeInference::InferConvolveShape( - lhs_shape, rhs_shape, conv->feature_group_count(), - conv->batch_group_count(), window, dnums)); - CHECK(ShapeUtil::Compatible(result_shape, inferred_return_shape)) - << "return shape set to: " << ShapeUtil::HumanString(result_shape) - << " but is inferred to be: " - << ShapeUtil::HumanString(inferred_return_shape); - - const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs); - const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs); - std::vector window_dimension_sizes; for (auto i : dnums.kernel_spatial_dimensions()) { window_dimension_sizes.push_back(ShapeUtil::GetDimension(rhs_shape, i)); @@ -1271,9 +1258,68 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { return Status::OK(); } + Status HandleConvolution(HloInstruction* conv) override { + auto lhs = conv->operand(0); + auto rhs = conv->operand(1); + const auto& window = conv->window(); + const Shape& result_shape = conv->shape(); + const Shape& lhs_shape = lhs->shape(); + const Shape& rhs_shape = rhs->shape(); + + TF_CHECK_OK(ShapeUtil::ValidateShape(lhs_shape)); + TF_CHECK_OK(ShapeUtil::ValidateShape(rhs_shape)); + CHECK(lhs_shape.IsArray()); + CHECK(rhs_shape.IsArray()); + + const auto& dnums = conv->convolution_dimension_numbers(); + const int64 num_spatial_dims = dnums.output_spatial_dimensions_size(); + CHECK_EQ(num_spatial_dims, dnums.input_spatial_dimensions_size()); + CHECK_EQ(num_spatial_dims, dnums.kernel_spatial_dimensions_size()); + CHECK_GE(num_spatial_dims, 0); + CHECK_EQ(window.dimensions_size(), num_spatial_dims); + + const auto lhs_rank = lhs_shape.rank(); + const auto rhs_rank = rhs_shape.rank(); + + CHECK_EQ(num_spatial_dims + 2, lhs_rank); + CHECK_EQ(num_spatial_dims + 2, rhs_rank); + + TF_ASSIGN_OR_RETURN(auto inferred_return_shape, + ShapeInference::InferConvolveShape( + lhs_shape, rhs_shape, conv->feature_group_count(), + conv->batch_group_count(), window, dnums)); + 
CHECK(ShapeUtil::Compatible(result_shape, inferred_return_shape)) + << "return shape set to: " << ShapeUtil::HumanString(result_shape) + << " but is inferred to be: " + << ShapeUtil::HumanString(inferred_return_shape); + + const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs); + const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs); + const bool lhs_same = ShapeUtil::SameElementType(lhs_shape, result_shape); + const bool rhs_same = ShapeUtil::SameElementType(rhs_shape, result_shape); + if (rhs_same && lhs_same) { + return HandleConvolutionWithLiterals(conv, lhs_literal, rhs_literal); + } + if (rhs_same) { + return HandleConvolutionWithLiterals( + conv, lhs_literal.Convert(result_shape.element_type()).ValueOrDie(), + rhs_literal); + } + if (lhs_same) { + return HandleConvolutionWithLiterals( + conv, lhs_literal, + rhs_literal.Convert(result_shape.element_type()).ValueOrDie()); + } + return HandleConvolutionWithLiterals( + conv, lhs_literal.Convert(result_shape.element_type()).ValueOrDie(), + rhs_literal.Convert(result_shape.element_type()).ValueOrDie()); + } + Status HandleDot(HloInstruction* dot) override { if (dot->dot_dimension_numbers().rhs_contracting_dimensions_size() == 1 && - parent_->use_fast_path_) { + parent_->use_fast_path_ && + ShapeUtil::SameElementType(dot->operand(0)->shape(), dot->shape()) && + ShapeUtil::SameElementType(dot->operand(1)->shape(), dot->shape())) { return HandleDot(dot); } return HandleDotSlowPath(dot); @@ -1342,23 +1388,16 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { return HandleDotSlowPath(dot); } - Status HandleDotSlowPath(HloInstruction* dot) { - auto lhs = dot->operand(0); - auto rhs = dot->operand(1); - CHECK(dot->shape().IsArray()); - CHECK(lhs->shape().IsArray()); - CHECK(rhs->shape().IsArray()); - + Status HandleDotSlowPathWithLiterals(HloInstruction* dot, + const Literal& lhs_literal, + const Literal& rhs_literal) { const auto& dnums = dot->dot_dimension_numbers(); - const auto lhs_rank = lhs->shape().rank(); - const auto rhs_rank = rhs->shape().rank(); + const auto lhs_rank = lhs_literal.shape().rank(); + const auto rhs_rank = rhs_literal.shape().rank(); - CHECK(ShapeUtil::SameElementType(lhs->shape(), rhs->shape())); - CHECK(ShapeUtil::SameElementType(lhs->shape(), dot->shape())); - - const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs); - const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs); + CHECK(ShapeUtil::SameElementType(lhs_literal.shape(), rhs_literal.shape())); + CHECK(ShapeUtil::SameElementType(lhs_literal.shape(), dot->shape())); CHECK_EQ(dnums.lhs_batch_dimensions_size(), dnums.rhs_batch_dimensions_size()); @@ -1406,7 +1445,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { const int64 rhs_dnum = dnums.rhs_contracting_dimensions(i); accumulate_index_locations.push_back( {&lhs_index[lhs_dnum], &rhs_index[rhs_dnum]}); - const int64 dim_size = lhs->shape().dimensions(lhs_dnum); + const int64 dim_size = lhs_literal.shape().dimensions(lhs_dnum); accumulate_index_sizes.push_back(dim_size); } const int64 total_contraction_size = Product(accumulate_index_sizes); @@ -1457,6 +1496,36 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { return Status::OK(); } + Status HandleDotSlowPath(HloInstruction* dot) { + auto lhs = dot->operand(0); + auto rhs = dot->operand(1); + CHECK(dot->shape().IsArray()); + CHECK(lhs->shape().IsArray()); + CHECK(rhs->shape().IsArray()); + const bool lhs_same = + ShapeUtil::SameElementType(lhs->shape(), 
dot->shape()); + const bool rhs_same = + ShapeUtil::SameElementType(rhs->shape(), dot->shape()); + const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs); + const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs); + if (lhs_same && rhs_same) { + return HandleDotSlowPathWithLiterals(dot, lhs_literal, rhs_literal); + } + if (lhs_same) { + return HandleDotSlowPathWithLiterals( + dot, lhs_literal, + rhs_literal.Convert(dot->shape().element_type()).ValueOrDie()); + } + if (rhs_same) { + return HandleDotSlowPathWithLiterals( + dot, lhs_literal.Convert(dot->shape().element_type()).ValueOrDie(), + rhs_literal); + } + return HandleDotSlowPathWithLiterals( + dot, lhs_literal.Convert(dot->shape().element_type()).ValueOrDie(), + rhs_literal.Convert(dot->shape().element_type()).ValueOrDie()); + } + Status HandlePad(HloInstruction* pad) override { CHECK(pad->operand(0)->shape().IsArray()); // Padding value must be scalar. @@ -2344,39 +2413,23 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { } // Enable CLZ only for int32, uint32, int64 and uint64. - template < - typename NativeT, - typename std::enable_if< - (std::is_floating_point::value || - std::is_integral::value || is_complex_t::value) && - !(std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value)>::type* = nullptr> + template ::value || + std::is_same::value)>::type* = nullptr> Status HandleClz(HloInstruction* clz) { return UnsupportedTypeError(clz); } template ::value || - std::is_same::value>::type* = nullptr> + std::is_integral::value && + !std::is_same::value>::type* = nullptr> Status HandleClz(HloInstruction* clz) { TF_ASSIGN_OR_RETURN(parent_->evaluated_[clz], ElementWiseUnaryOp(clz, [](ElementwiseT elem_operand) { - return 31 - tensorflow::Log2Floor(elem_operand); - })); - return Status::OK(); - } - - template ::value || - std::is_same::value>::type* = nullptr> - Status HandleClz(HloInstruction* clz) { - TF_ASSIGN_OR_RETURN(parent_->evaluated_[clz], - ElementWiseUnaryOp(clz, [](ElementwiseT elem_operand) { - return 63 - tensorflow::Log2Floor64(elem_operand); + return (sizeof(elem_operand) * CHAR_BIT - 1) - + tensorflow::Log2Floor64(elem_operand); })); return Status::OK(); } @@ -2385,23 +2438,18 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { return HandleClz(clz); } - // Enable Popcnt only for int32, uint32, int64 and uint64. 
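Editor's note (not part of the patch): the HandleClz hunk above collapses the separate 32-bit and 64-bit lambdas into one that derives the width from the element type, i.e. clz(x) = (bit_width - 1) - floor(log2(x)) for x > 0, and the HandlePopulationCount hunk that follows only widens its SFINAE guard to all non-bool integral types in the same spirit. A self-contained sketch of the identity (the shift loop stands in for `tensorflow::Log2Floor64`, which is not assumed here):

```cpp
#include <climits>
#include <cstdint>
#include <iostream>

// clz(x) == (bit_width - 1) - floor(log2(x)) for any x > 0, which is the form
// the generalized evaluator lambda uses.
template <typename T>
int ClzViaLog2Floor(T x) {
  int log2_floor = -1;  // floor(log2(x)) computed by shifting; requires x > 0.
  for (T v = x; v != 0; v >>= 1) ++log2_floor;
  return static_cast<int>(sizeof(T) * CHAR_BIT) - 1 - log2_floor;
}

int main() {
  std::cout << ClzViaLog2Floor<uint8_t>(1) << "\n";          // 7
  std::cout << ClzViaLog2Floor<uint32_t>(1u << 20) << "\n";  // 11
  std::cout << ClzViaLog2Floor<uint64_t>(1ull) << "\n";      // 63
  return 0;
}
```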
template ::value || - std::is_same::value || - std::is_same::value || - std::is_same::value)>::type* = nullptr> + (!std::is_integral::value || + std::is_same::value)>::type* = nullptr> Status HandlePopulationCount(HloInstruction* popcnt) { return UnsupportedTypeError(popcnt); } template ::value || - std::is_same::value || - std::is_same::value || - std::is_same::value>::type* = nullptr> + std::is_integral::value && + !std::is_same::value>::type* = nullptr> Status HandlePopulationCount(HloInstruction* popcnt) { TF_ASSIGN_OR_RETURN( parent_->evaluated_[popcnt], diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc index a50af6bf1b9..d7e8984dee8 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc @@ -1181,7 +1181,7 @@ string HloDotDumper::GetInstructionNodeExtraInfo(const HloInstruction* instr) { instr_shape = StrCat( absl::string_view(instr_shape).substr(0, kMaxShapeLen - 3), "..."); } - lines.push_back(instr_shape); + lines.push_back(HtmlLikeStringSanitize(instr_shape)); } if (debug_options_.xla_hlo_graph_addresses()) { lines.push_back(StrFormat("[%p]", instr)); diff --git a/tensorflow/compiler/xla/service/hlo_input_output_alias_config.cc b/tensorflow/compiler/xla/service/hlo_input_output_alias_config.cc index e123161720b..34bc30d641f 100644 --- a/tensorflow/compiler/xla/service/hlo_input_output_alias_config.cc +++ b/tensorflow/compiler/xla/service/hlo_input_output_alias_config.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_input_output_alias_config.h" +#include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_module.h" namespace xla { @@ -24,9 +25,10 @@ bool HloInputOutputAliasConfig::OutputHasAlias( return alias_.element(output_index).has_value(); } -Status HloInputOutputAliasConfig::SetUpAlias(const ShapeIndex& output_index, - int64 param_number, - const ShapeIndex& param_index) { +Status HloInputOutputAliasConfig::SetUpAlias( + const ShapeIndex& output_index, int64 param_number, + const ShapeIndex& param_index, + HloInputOutputAliasConfig::AliasKind must_alias) { TF_RET_CHECK(ShapeUtil::IndexIsValid(alias_.shape(), output_index)) << "Trying to set up alias at " << output_index.ToString() << " which is an invalid index for shape " @@ -41,7 +43,8 @@ Status HloInputOutputAliasConfig::SetUpAlias(const ShapeIndex& output_index, param_number, param_index.ToString(), output_index.ToString(), alias_.element(output_index)->parameter_number, alias_.element(output_index)->parameter_index.ToString()); - (*alias_.mutable_element(output_index)) = Alias(param_number, param_index); + (*alias_.mutable_element(output_index)) = + Alias(param_number, param_index, must_alias); VLOG(4) << "Set up alias between output index " << output_index.ToString() << " and parameter " << param_index << " at index " << param_index.ToString(); @@ -61,6 +64,11 @@ HloInputOutputAliasProto HloInputOutputAliasConfig::ToProto() const { for (int64 i : data->parameter_index) { entry.add_parameter_shape_index(i); } + if (data->must_alias()) { + entry.set_kind(Kind::MUST_ALIAS); + } else { + entry.set_kind(Kind::MAY_ALIAS); + } result.add_entries()->Swap(&entry); } }); @@ -77,8 +85,9 @@ StatusOr HloInputOutputAliasConfig::CreateFromProto( int64 param_number = entry.parameter_number(); ShapeIndex param_index(entry.parameter_shape_index().begin(), entry.parameter_shape_index().end()); + AliasKind kind = 
entry.kind() == Kind::MAY_ALIAS ? kMayAlias : kMustAlias; TF_RETURN_IF_ERROR( - result.SetUpAlias(output_index, param_number, param_index)); + result.SetUpAlias(output_index, param_number, param_index, kind)); } return result; } @@ -93,9 +102,9 @@ string HloInputOutputAliasConfig::ToString() const { ForEachAlias([&](const ShapeIndex& output_index, const Alias& alias) { pieces.push_back(absl::StrFormat( - " OutputIndex %s is aliased with parameter %lld at %s:", - output_index.ToString(), alias.parameter_number, - alias.parameter_index.ToString())); + " OutputIndex %s is %saliased with parameter %lld at %s:", + output_index.ToString(), alias.kind == kMustAlias ? "must-" : "may-", + alias.parameter_number, alias.parameter_index.ToString())); }); return absl::StrJoin(pieces, "\n"); } @@ -112,6 +121,19 @@ string HloInputOutputAliasConfig::ToShortString() const { return absl::StrJoin(pieces, ", "); } +bool HloInputOutputAliasConfig::ParameterMustAlias( + int64 param_number, const ShapeIndex& param_index) const { + bool result = false; + alias_.ForEachElement( + [&](const xla::ShapeIndex&, absl::optional alias) { + if (alias && alias->parameter_number == param_number && + alias->parameter_index == param_index && alias->must_alias()) { + result = true; + } + }); + return result; +} + absl::optional HloInputOutputAliasConfig::GetAliasedOutput( int64 param_number, const ShapeIndex& param_index) const { absl::optional output; diff --git a/tensorflow/compiler/xla/service/hlo_input_output_alias_config.h b/tensorflow/compiler/xla/service/hlo_input_output_alias_config.h index d5ca28e9387..6b84bdb6a68 100644 --- a/tensorflow/compiler/xla/service/hlo_input_output_alias_config.h +++ b/tensorflow/compiler/xla/service/hlo_input_output_alias_config.h @@ -32,22 +32,32 @@ class HloModule; // parameter index in the entry computation. class HloInputOutputAliasConfig { public: + // The kind of aliases which can be set. A kMayAlias is one setup at + // compilation time by the user, and has to be respected. A kMustAlias one + // might be setup by the compiler, if it decides it is convenient to do so. + enum AliasKind { + kMayAlias, + kMustAlias, + }; // Defines the alias information for a given output buffer. A given output // buffer shape index can refer only to one parameter+index. struct Alias { - Alias(int64 parameter_number, ShapeIndex parameter_index) + Alias(int64 parameter_number, ShapeIndex parameter_index, + AliasKind kind = kMayAlias) : parameter_number(parameter_number), - parameter_index(std::move(parameter_index)) {} + parameter_index(std::move(parameter_index)), + kind(kind) {} int64 parameter_number; ShapeIndex parameter_index; + AliasKind kind; + + bool must_alias() const { return kind == kMustAlias; } std::string ToString() { - if (parameter_index.empty()) { - return absl::StrCat(parameter_number); - } - return absl::StrFormat("(%lld, %s)", parameter_number, - parameter_index.ToString()); + return absl::StrFormat("(%lld, %s, %s)", parameter_number, + parameter_index.ToString(), + kind == kMustAlias ? "must_alias" : "may_alias"); } }; @@ -61,7 +71,8 @@ class HloInputOutputAliasConfig { // Sets up alias config from `output_index` to `param_index` at // `param_number`. Status SetUpAlias(const ShapeIndex& output_index, int64 param_number, - const ShapeIndex& param_index); + const ShapeIndex& param_index, + AliasKind must_alias = kMayAlias); // Returns true if the given parameter is aliased with one of the output // buffers. 
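Editor's note (not part of the patch): a sketch of how the new `AliasKind` plumbing might be exercised end to end. It assumes an `xla::HloModule*` whose entry computation returns a tuple and the existing `HloModule::input_output_alias_config()` accessor; the function name `SetUpMustAlias` is hypothetical.

```cpp
#include "tensorflow/compiler/xla/service/hlo_input_output_alias_config.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/util.h"

namespace xla {

// Requests that tuple output element {0} share its buffer with parameter 0
// as a hard constraint, then reads the decision back through the new
// ParameterMustAlias() query.
Status SetUpMustAlias(HloModule* module) {
  HloInputOutputAliasConfig& config = module->input_output_alias_config();
  Status status = config.SetUpAlias(
      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{},
      HloInputOutputAliasConfig::kMustAlias);
  if (!status.ok()) {
    return status;
  }
  if (!config.ParameterMustAlias(/*param_number=*/0, /*param_index=*/{})) {
    return InvalidArgument("parameter 0 was expected to be must-aliased");
  }
  return Status::OK();
}

}  // namespace xla
```

With `kMayAlias` (the default fourth argument) the same `SetUpAlias` call would succeed, but `ParameterMustAlias` would return false, matching the distinction the `ToProto`/`CreateFromProto` round trip above preserves.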
@@ -92,6 +103,11 @@ class HloInputOutputAliasConfig { absl::optional GetAliasedParameter( const ShapeIndex& output_index) const; + // Returns if the parameter at the given parameter number and parameter + // index must-alias with an output. + bool ParameterMustAlias(int64 param_number, + const ShapeIndex& param_index) const; + using AliasFn = std::function; diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index 9957df41f1a..2ce3c12b4e9 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -174,8 +174,19 @@ StatusOr> HloInstruction::CreateFromProto( comparison_direction, StringToComparisonDirection(proto.comparison_direction())); } - instruction = - CreateCompare(shape, operands(0), operands(1), *comparison_direction); + auto comparison_type_str = proto.comparison_type(); + if (!comparison_type_str.empty()) { + // If a comparison type is specified, it *must* be valid. + TF_ASSIGN_OR_RETURN(auto comparison_type, + StringToComparisonType(comparison_type_str)); + instruction = CreateCompare(shape, operands(0), operands(1), + *comparison_direction, comparison_type); + } else { + // Allow the comparison type to be optional. + // The comparison type will be determined by the types of the operands. + instruction = CreateCompare(shape, operands(0), operands(1), + *comparison_direction); + } break; } case HloOpcode::kTriangularSolve: { @@ -926,8 +937,9 @@ HloInstruction::CreateRngBitGenerator(const Shape& shape, HloInstruction* state, /* static */ std::unique_ptr HloInstruction::CreateCompare( const Shape& shape, HloInstruction* lhs, HloInstruction* rhs, - ComparisonDirection direction) { - return absl::make_unique(shape, lhs, rhs, direction); + ComparisonDirection direction, absl::optional type) { + return absl::make_unique(shape, lhs, rhs, direction, + type); } /* static */ std::unique_ptr @@ -1750,10 +1762,10 @@ void HloInstruction::DetachFromOperandsAndUsers() { } } -std::unique_ptr HloInstruction::Clone( - const string& suffix, HloCloneContext* context) const { +std::unique_ptr HloInstruction::CloneWithNewShape( + const Shape& shape, const string& suffix, HloCloneContext* context) const { std::unique_ptr clone = - CloneWithNewOperands(shape_, operands_, context); + CloneWithNewOperands(shape, operands_, context); if (suffix.empty()) { clone->name_ = name(); } else { @@ -1790,6 +1802,13 @@ std::unique_ptr HloInstruction::Clone( return clone; } +std::unique_ptr HloInstruction::Clone( + const string& suffix, HloCloneContext* context) const { + std::unique_ptr clone = + CloneWithNewShape(shape_, suffix, context); + return clone; +} + std::pair HloInstruction::LatestNonGteAncestorAndIndex() const { const HloInstruction* hlo = this; @@ -2189,6 +2208,27 @@ Status HloInstruction::ReplaceOperandWithDifferentShape( return Status::OK(); } +Status HloInstruction::ReplaceUsesWith(absl::Span users, HloInstruction* new_producer) { + TF_RET_CHECK( + ShapeUtil::CompatibleIgnoringFpPrecision(shape(), new_producer->shape())) + << shape() << " is not compatible with " << new_producer->shape(); + return ReplaceAllUsesWithDifferentShape(users, new_producer); +} + +Status HloInstruction::ReplaceAllUsesWithDifferentShape( + absl::Span users, HloInstruction* new_producer) { + for (HloInstruction* user : users) { + TF_RETURN_IF_ERROR(ReplaceUseWithDifferentShape(user, new_producer)); + } + + if (parent_ && parent_->root_instruction() == this) {
parent_->set_root_instruction(new_producer, + /*accept_different_shape=*/true); + } + return Status::OK(); +} + Status HloInstruction::ReplaceAllUsesWith(HloInstruction* new_producer) { TF_RET_CHECK( ShapeUtil::CompatibleIgnoringFpPrecision(shape(), new_producer->shape())) diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index 8c50a9bb8fc..bdd64c908f0 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -595,7 +595,8 @@ class HloInstruction { // Creates a compare op, performing the comparison specified in direction. static std::unique_ptr CreateCompare( const Shape& shape, HloInstruction* lhs, HloInstruction* rhs, - Comparison::Direction direction); + Comparison::Direction direction, + absl::optional type = absl::nullopt); static std::unique_ptr CreateTriangularSolve( const Shape& shape, HloInstruction* a, HloInstruction* b, @@ -1201,6 +1202,12 @@ class HloInstruction { // Same as ReplaceAllUsesWith, but new_producer can have a different shape. Status ReplaceAllUsesWithDifferentShape(HloInstruction* new_producer); + // Same as ReplaceAllUsesWith, but only replace given set of users. + Status ReplaceUsesWith(absl::Span users, + HloInstruction* new_producer); + Status ReplaceAllUsesWithDifferentShape( + absl::Span users, HloInstruction* new_producer); + // Performs a postorder DFS visit using this node as the root. If // call_finish_visit is true, then DfsHloVisitor::FinishVisit is called when // complete. If ignore_control_predecessors is true, instructions only @@ -1413,6 +1420,11 @@ class HloInstruction { std::unique_ptr Clone( const string& suffix = "clone", HloCloneContext* context = nullptr) const; + // Clones the HLO instruction as above but with new shape. + std::unique_ptr CloneWithNewShape( + const Shape& shape, const string& suffix = "clone", + HloCloneContext* context = nullptr) const; + // Clones the HLO instruction as above but with new shape and operands. std::unique_ptr CloneWithNewOperands( const Shape& shape, absl::Span new_operands, diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc index 3d34fa03a80..dbc1d85d1bb 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.cc +++ b/tensorflow/compiler/xla/service/hlo_instructions.cc @@ -204,12 +204,13 @@ std::unique_ptr HloFftInstruction::CloneWithNewOperandsImpl( fft_length_); } -HloCompareInstruction::HloCompareInstruction(const Shape& shape, - HloInstruction* lhs, - HloInstruction* rhs, - ComparisonDirection direction) +HloCompareInstruction::HloCompareInstruction( + const Shape& shape, HloInstruction* lhs, HloInstruction* rhs, + ComparisonDirection direction, absl::optional type) : HloInstruction(HloOpcode::kCompare, shape), - compare_(direction, lhs->shape().element_type()) { + compare_(direction, type ? 
(*type) + : Comparison::DefaultComparisonType( + lhs->shape().element_type())) { AppendOperand(lhs); AppendOperand(rhs); } @@ -218,12 +219,21 @@ HloInstructionProto HloCompareInstruction::ToProto() const { HloInstructionProto proto = HloInstruction::ToProto(); proto.set_comparison_direction( ComparisonDirectionToString(compare_.GetDirection())); + proto.set_comparison_type(ComparisonTypeToString(compare_.GetType())); return proto; } std::vector HloCompareInstruction::ExtraAttributesToStringImpl( const HloPrintOptions& options) const { - return {StrCat("direction=", ComparisonDirectionToString(direction()))}; + std::vector result; + result.push_back( + StrCat("direction=", ComparisonDirectionToString(direction()))); + if (compare_.GetType() != + Comparison::DefaultComparisonType(operand(0)->shape().element_type())) { + result.push_back( + StrCat("type=", ComparisonTypeToString(compare_.GetType()))); + } + return result; } bool HloCompareInstruction::IdenticalSlowPath( @@ -238,8 +248,8 @@ std::unique_ptr HloCompareInstruction::CloneWithNewOperandsImpl( const Shape& shape, absl::Span new_operands, HloCloneContext* context) const { CHECK_EQ(new_operands.size(), 2); - return absl::make_unique(shape, new_operands[0], - new_operands[1], direction()); + return absl::make_unique( + shape, new_operands[0], new_operands[1], direction(), type()); } namespace { diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h index 51317b32bd0..3f92bb92f02 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.h +++ b/tensorflow/compiler/xla/service/hlo_instructions.h @@ -136,8 +136,10 @@ class HloCompareInstruction : public HloInstruction { public: explicit HloCompareInstruction(const Shape& shape, HloInstruction* lhs, HloInstruction* rhs, - ComparisonDirection direction); + ComparisonDirection direction, + absl::optional type); ComparisonDirection direction() const { return compare_.GetDirection(); } + Comparison::Type type() const { return compare_.GetType(); } HloInstructionProto ToProto() const override; private: diff --git a/tensorflow/compiler/xla/service/hlo_lexer.cc b/tensorflow/compiler/xla/service/hlo_lexer.cc index 5502665e886..749193a83ef 100644 --- a/tensorflow/compiler/xla/service/hlo_lexer.cc +++ b/tensorflow/compiler/xla/service/hlo_lexer.cc @@ -281,6 +281,7 @@ TokKind HloLexer::LexIdentifier() { KEYWORD(ROOT); KEYWORD(maximal); KEYWORD(replicated); + KEYWORD(last_tile_dim_replicate); #undef KEYWORD @@ -495,6 +496,8 @@ string TokKindToString(TokKind kind) { return "kw_maximal"; case TokKind::kw_replicated: return "kw_replicated"; + case TokKind::kw_last_tile_dim_replicate: + return "kw_last_tile_dim_replicate"; case TokKind::kw_nan: return "kw_nan"; case TokKind::kw_inf: diff --git a/tensorflow/compiler/xla/service/hlo_lexer.h b/tensorflow/compiler/xla/service/hlo_lexer.h index 6a59f180ad8..b8c7debaab4 100644 --- a/tensorflow/compiler/xla/service/hlo_lexer.h +++ b/tensorflow/compiler/xla/service/hlo_lexer.h @@ -61,6 +61,7 @@ enum class TokKind { kw_false, kw_maximal, kw_replicated, + kw_last_tile_dim_replicate, kw_nan, kw_inf, diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc index c715d016c4f..4a67c1d2146 100644 --- a/tensorflow/compiler/xla/service/hlo_module.cc +++ b/tensorflow/compiler/xla/service/hlo_module.cc @@ -15,6 +15,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_module.h" +#include #include #include #include @@ -442,6 +443,7 @@ StatusOr HloModule::CreateModuleConfigFromShape( } module_config.set_use_spmd_partitioning( execution_options->use_spmd_partitioning()); + module_config.set_deduplicate_hlo(execution_options->deduplicate_hlo()); if (execution_options->has_device_assignment()) { TF_ASSIGN_OR_RETURN(std::unique_ptr device_assignment, DeviceAssignment::Deserialize( @@ -650,30 +652,28 @@ bool CompareComputationsByContent(HloComputation* a, HloComputation* b) { } // anonymous namespace std::vector HloModule::MakeComputationSorted() const { - std::vector result; - result.reserve(computations_.size()); - for (const auto& computation : computations_) { - result.push_back(computation.get()); + std::vector result = MakeComputationPostOrder(); + if (config().content_aware_computation_sorting()) { + absl::c_sort(result, CompareComputationsByContent); } - std::sort(result.begin(), result.end(), CompareComputationsByContent); return result; } std::vector HloModule::MakeNonfusionComputations() const { - std::vector result; - for (auto* c : computations()) { - if (c->IsFusionComputation()) { - continue; - } - result.push_back(c); - } + std::vector result = MakeComputationPostOrder(); + result.erase(std::remove_if( + result.begin(), result.end(), + [](HloComputation* c) { return c->IsFusionComputation(); }), + result.end()); return result; } std::vector HloModule::MakeNonfusionComputationsSorted() const { auto result = MakeNonfusionComputations(); - std::sort(result.begin(), result.end(), CompareComputationsByContent); + if (config().content_aware_computation_sorting()) { + absl::c_sort(result, CompareComputationsByContent); + } return result; } diff --git a/tensorflow/compiler/xla/service/hlo_module_config.h b/tensorflow/compiler/xla/service/hlo_module_config.h index 0abf3a496f7..ae0a8aae838 100644 --- a/tensorflow/compiler/xla/service/hlo_module_config.h +++ b/tensorflow/compiler/xla/service/hlo_module_config.h @@ -138,6 +138,13 @@ class HloModuleConfig { } bool use_spmd_partitioning() const { return use_spmd_partitioning_; } + // If enabled, deduplicate equivalent hlos into function calls to reduce code + // size. + void set_deduplicate_hlo(bool deduplicate_hlo) { + deduplicate_hlo_ = deduplicate_hlo; + } + bool deduplicate_hlo() const { return deduplicate_hlo_; } + // Return a string which unambiguously represents all the fields of this data // structure. Used for generating a cache key for storing the compiled // executable. @@ -188,6 +195,14 @@ class HloModuleConfig { alias_passthrough_params_ = alias_passthrough_params; } + bool content_aware_computation_sorting() const { + return content_aware_computation_sorting_; + } + void set_content_aware_computation_sorting( + bool content_aware_computation_sorting) { + content_aware_computation_sorting_ = content_aware_computation_sorting; + } + FusionConfigCollection fusion_config_collection() const { return fusion_config_collection_; } @@ -238,6 +253,10 @@ class HloModuleConfig { // needs to partition the module. bool use_spmd_partitioning_ = false; + // If enabled, deduplicate equivalent hlos into function calls to reduce code + // size. + bool deduplicate_hlo_ = false; + // The target maximum parallelism at which to partition HLOs for parallel // execution on the CPU backend. 
int64 intra_op_parallelism_threads_ = -1; @@ -251,6 +270,8 @@ bool alias_passthrough_params_ = false; + bool content_aware_computation_sorting_ = false; + FusionConfigCollection fusion_config_collection_ = FusionConfigCollection::kOff; diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc index ec0540b8607..2afa06a5df4 100644 --- a/tensorflow/compiler/xla/service/hlo_parser.cc +++ b/tensorflow/compiler/xla/service/hlo_parser.cc @@ -194,6 +194,7 @@ class HloParserImpl : public HloParser { kBracedHloComputationList, kFftType, kComparisonDirection, + kComparisonType, kWindow, kConvolutionDimensionNumbers, kSharding, @@ -327,6 +328,7 @@ class HloParserImpl : public HloParser { bool ParseOpcode(HloOpcode* result); bool ParseFftType(FftType* result); bool ParseComparisonDirection(ComparisonDirection* result); + bool ParseComparisonType(Comparison::Type* result); bool ParseFusionKind(HloInstruction::FusionKind* result); bool ParseRandomDistribution(RandomDistribution* result); bool ParseRandomAlgorithm(RandomAlgorithm* result); @@ -552,33 +554,39 @@ bool HloParserImpl::ParseAliasing(AliasingData* data) { return false; } - if (lexer_.GetKind() != TokKind::kLparen) { - // Short form: "{0}: 0", output index "{}" is assumed. - int64 param_num; - ParseInt64(&param_num); - data->emplace(std::piecewise_construct, std::forward_as_tuple(out), - std::forward_as_tuple(param_num, ShapeIndex{})); - } else { - // Long form: "{0}: (0, {0})", output index is explicitly specified. - if (!ParseToken(TokKind::kLparen, errmsg)) { - return false; - } - int64 param_num; - ParseInt64(&param_num); - if (!ParseToken(TokKind::kComma, errmsg)) { - return false; - } - ShapeIndex param_idx; - if (!ParseShapeIndex(&param_idx)) { - return false; - } - data->emplace(std::piecewise_construct, std::forward_as_tuple(out), - std::forward_as_tuple(param_num, param_idx)); - if (!ParseToken(TokKind::kRparen, errmsg)) { - return false + if (!ParseToken(TokKind::kLparen, errmsg)) { + return false; + } + int64 param_num; + ParseInt64(&param_num); + if (!ParseToken(TokKind::kComma, errmsg)) { + return false; + } + ShapeIndex param_idx; + if (!ParseShapeIndex(&param_idx)) { + return false; + } + + HloInputOutputAliasConfig::AliasKind alias_kind = + HloInputOutputAliasConfig::kMayAlias; + if (EatIfPresent(TokKind::kComma)) { + std::string type; + ParseName(&type); + if (type == "must-alias") { + alias_kind = HloInputOutputAliasConfig::kMustAlias; + } else if (type == "may-alias") { + alias_kind = HloInputOutputAliasConfig::kMayAlias; + } else { + return TokenError("Unexpected aliasing kind; expected must-alias or may-alias"); } } + data->emplace(std::piecewise_construct, std::forward_as_tuple(out), + std::forward_as_tuple(param_num, param_idx, alias_kind)); + if (!ParseToken(TokKind::kRparen, errmsg)) { + return false; + } + if (!EatIfPresent(TokKind::kComma)) { break; } @@ -624,8 +632,9 @@ bool HloParserImpl::ParseHloModule(HloModule* module) { if (aliasing_data) { HloInputOutputAliasConfig alias_config(module->result_shape()); for (auto& p : *aliasing_data) { - Status st = alias_config.SetUpAlias(p.first, p.second.parameter_number, - p.second.parameter_index); + Status st = + alias_config.SetUpAlias(p.first, p.second.parameter_number, + p.second.parameter_index, p.second.kind); if (!st.ok()) { return TokenError(st.error_message()); } @@ -1355,14 +1364,16 @@ bool HloParserImpl::ParseInstructionRhs(HloComputation::Builder* builder, } case HloOpcode::kCompare: { optional direction; + optional
type; attrs["direction"] = {/*required=*/true, AttrTy::kComparisonDirection, &direction}; + attrs["type"] = {/*required=*/false, AttrTy::kComparisonType, &type}; if (!ParseOperands(&operands, /*expected_size=*/2) || !ParseAttributes(attrs)) { return false; } instruction = builder->AddInstruction(HloInstruction::CreateCompare( - shape, operands[0], operands[1], *direction)); + shape, operands[0], operands[1], *direction, type)); break; } case HloOpcode::kCholesky: { @@ -2129,6 +2140,7 @@ bool HloParserImpl::ParseSingleSharding(OpSharding* sharding, LocTy loc = lexer_.GetLoc(); bool maximal = false; bool replicated = false; + bool last_tile_dim_replicate = false; std::vector devices; std::vector tile_assignment_dimensions; while (lexer_.GetKind() != TokKind::kRbrace) { @@ -2180,6 +2192,10 @@ bool HloParserImpl::ParseSingleSharding(OpSharding* sharding, } break; } + case TokKind::kw_last_tile_dim_replicate: + last_tile_dim_replicate = true; + lexer_.Lex(); + break; case TokKind::kRbrace: break; default: @@ -2218,6 +2234,7 @@ bool HloParserImpl::ParseSingleSharding(OpSharding* sharding, for (int64 device : devices) { sharding->add_tile_assignment_devices(device); } + sharding->set_replicate_on_last_tile_dim(last_tile_dim_replicate); } lexer_.Lex(); @@ -2674,7 +2691,9 @@ struct MinMaxFiniteValue { template <> struct MinMaxFiniteValue { - static double max() { return static_cast(bfloat16::highest()); } + static double max() { + return static_cast(Eigen::NumTraits::highest()); + } static double min() { return -max(); } }; @@ -3003,6 +3022,14 @@ bool HloParserImpl::ParseAttributeHelper( ->emplace(result); return true; } + case AttrTy::kComparisonType: { + Comparison::Type result; + if (!ParseComparisonType(&result)) { + return false; + } + static_cast*>(attr_out_ptr)->emplace(result); + return true; + } case AttrTy::kEnum: { if (lexer_.GetKind() != TokKind::kIdent) { return TokenError("expects an enumeration value"); @@ -3597,7 +3624,7 @@ bool HloParserImpl::ParseHloComputationList( if (!ParseHloComputation(&computation)) { return false; } - LOG(INFO) << "parsed computation " << computation->name(); + VLOG(3) << "parsed computation " << computation->name(); result->push_back(computation); return true; }; @@ -4115,7 +4142,7 @@ bool HloParserImpl::ParseFftType(FftType* result) { } bool HloParserImpl::ParseComparisonDirection(ComparisonDirection* result) { - VLOG(1) << "ParseComparisonDirection"; + VLOG(3) << "ParseComparisonDirection"; if (lexer_.GetKind() != TokKind::kIdent) { return TokenError("expects comparison direction"); } @@ -4130,6 +4157,21 @@ bool HloParserImpl::ParseComparisonDirection(ComparisonDirection* result) { return true; } +bool HloParserImpl::ParseComparisonType(Comparison::Type* result) { + VLOG(1) << "ParseComparisonType"; + if (lexer_.GetKind() != TokKind::kIdent) { + return TokenError("expects comparison type"); + } + std::string val = lexer_.GetStrVal(); + auto status_or_result = StringToComparisonType(val); + if (!status_or_result.ok()) { + return TokenError(StrFormat("expects comparison type but sees: %s", val)); + } + *result = status_or_result.ValueOrDie(); + lexer_.Lex(); + return true; +} + bool HloParserImpl::ParseFusionKind(HloInstruction::FusionKind* result) { VLOG(3) << "ParseFusionKind"; if (lexer_.GetKind() != TokKind::kIdent) { diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc index 484578e5e0e..aba6aeff999 100644 --- a/tensorflow/compiler/xla/service/hlo_parser_test.cc +++ 
b/tensorflow/compiler/xla/service/hlo_parser_test.cc @@ -230,7 +230,7 @@ R"(HloModule SelectR1F32WithCmpR1F32sFromParamsSmall_module ENTRY %SelectR1F32WithCmpR1F32sFromParamsSmall.v4 (v1: f32[4], v2: f32[4]) -> f32[4] { %v1 = f32[4]{0} parameter(0), sharding={maximal device=1} %v2 = f32[4]{0} parameter(1), sharding={maximal device=1} - %greater-than = pred[4]{0} compare(f32[4]{0} %v1, f32[4]{0} %v2), direction=GT, sharding={replicated} + %greater-than = pred[4]{0} compare(f32[4]{0} %v1, f32[4]{0} %v2), direction=GT, type=TOTALORDER, sharding={replicated} ROOT %select = f32[4]{0} select(pred[4]{0} %greater-than, f32[4]{0} %v1, f32[4]{0} %v2), sharding={} } @@ -512,7 +512,7 @@ R"(HloModule R4F32OverlapSmall_module %ge_F32.v3 (lhs: f32[], rhs: f32[]) -> pred[] { %lhs = f32[] parameter(0) %rhs = f32[] parameter(1) - ROOT %greater-than-or-equal-to = pred[] compare(f32[] %lhs, f32[] %rhs), direction=GE + ROOT %greater-than-or-equal-to = pred[] compare(f32[] %lhs, f32[] %rhs), direction=GE, type=TOTALORDER } %add_F32.v3 (lhs.1: f32[], rhs.1: f32[]) -> f32[] { @@ -2399,7 +2399,7 @@ ENTRY c2 { TEST_F(HloParserTest, SimpleAliasing) { const string original = R"( -HloModule Module, input_output_alias={ {0}: (0, {0}), {1}: (0, {1}) } +HloModule Module, input_output_alias={ {0}: (0, {0}, must-alias), {1}: (0, {1}) } ENTRY entry { %p = (f32[], f32[]) parameter(0) @@ -2413,42 +2413,13 @@ ENTRY entry { std::unique_ptr parsed_module = module.ConsumeValueOrDie(); EXPECT_EQ(parsed_module->input_output_alias_config().GetAliasedOutput(0, {0}), ShapeIndex{0}); + + EXPECT_TRUE( + parsed_module->input_output_alias_config().ParameterMustAlias(0, {0})); EXPECT_EQ(parsed_module->input_output_alias_config().GetAliasedOutput(0, {1}), ShapeIndex{1}); -} - -TEST_F(HloParserTest, SimpleAliasingShortForm) { - const string original = R"( -HloModule Module, input_output_alias={ {0}: 0, {1}: 1 } - -ENTRY entry { - %p0 = f32[] parameter(0) - %p1 = f32[] parameter(1) - ROOT %out = (f32[], f32[]) tuple(%p0, %p1) -} - )"; - auto module = ParseAndReturnVerifiedModule(original); - TF_ASSERT_OK(module.status()); - std::unique_ptr parsed_module = module.ConsumeValueOrDie(); - EXPECT_EQ(parsed_module->input_output_alias_config().GetAliasedOutput(0, {}), - ShapeIndex{0}); - EXPECT_EQ(parsed_module->input_output_alias_config().GetAliasedOutput(1, {}), - ShapeIndex{1}); -} - -TEST_F(HloParserTest, SimpleAliasingShortFormError) { - const string original = R"( -HloModule Module, input_output_alias={ {0}: A, {1}: 1 } - -ENTRY entry { - %p0 = f32[] parameter(0) - %p1 = f32[] parameter(1) - ROOT %out = (f32[], f32[]) tuple(%p0, %p1) -} - )"; - ExpectHasSubstr( - ParseAndReturnUnverifiedModule(original).status().error_message(), - "expects integer"); + EXPECT_FALSE( + parsed_module->input_output_alias_config().ParameterMustAlias(0, {1})); } TEST_F(HloParserTest, NestedAliasing) { @@ -2626,6 +2597,21 @@ TEST_F(HloParserTest, ParseSharding) { EXPECT_EQ(sharding.ToString(), original); } +TEST_F(HloParserTest, ParseShardingPartialReplication) { + const string original = "{devices=[2,2]0,1,2,3 last_tile_dim_replicate}"; + TF_ASSERT_OK_AND_ASSIGN(HloSharding sharding, ParseSharding(original)); + EXPECT_EQ(sharding.ToString(), original); + Array group_tiling({2}); + group_tiling(0) = 0; + group_tiling(1) = 1; + std::vector group0_members({0, 1}); + std::vector group1_members({2, 3}); + EXPECT_EQ( + HloSharding::PartialTile(group_tiling, {group0_members, group1_members}) + .ToString(), + original); +} + TEST_F(HloParserTest, ParseFrontendAttributes) { 
const string original = R"({attr_a="test_a",attr_b="b",attr_c="s64",attr_d="a/b"})"; diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc index 2166ecdd890..7f974a618a8 100644 --- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc +++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc @@ -121,9 +121,9 @@ struct Item { bool placed = false; // To avoid an infinite loop rematerializing the same set of - // instructions ad infinitum, keep a blacklist of instructions + // instructions ad infinitum, keep a denylist of instructions // which should not be rematerialized. - bool blacklisted = false; + bool denylisted = false; // The buffers defined by this instruction. BufferIdList buffers_defined; @@ -292,8 +292,8 @@ class InstructionList { InsertBeforeInstructions(to_insert, {max_position_item->next}); } - void Blacklist(const HloInstruction* inst) { - GetItem(inst)->blacklisted = true; + void Denylist(const HloInstruction* inst) { + GetItem(inst)->denylisted = true; } private: @@ -745,7 +745,7 @@ Status MemoryUsageTracker::EndInstruction() { for (BufferId buffer_id : in_progress_item_->buffers_used) { Buffer& buffer = buffers_.at(buffer_id); buffer.unfinished_user_count--; - CHECK_GE(buffer.unfinished_user_count, 0) + TF_RET_CHECK(buffer.unfinished_user_count >= 0) << buffer.ToString() << " has negative unfinished user count."; if (buffer.unfinished_user_count == 0) { // Buffer is now dead. @@ -1158,13 +1158,13 @@ std::vector GetInitialBlock(const InstructionList& instruction_list, return item_block; } -// Returns whether any instruction in 'block' is blacklisted or +// Returns whether any instruction in 'block' is denylisted or // non-rematerializable. -bool AnyBlacklistedOrNonRematerializable( +bool AnyDenylistedOrNonRematerializable( const std::vector& block, absl::flat_hash_map* rematerializable_map) { for (auto* item : block) { - if (item->blacklisted) { + if (item->denylisted) { return true; } if (!CanBeRematerialized(item->instruction, rematerializable_map)) { @@ -1195,10 +1195,10 @@ MemoryUsageTracker::PickRematerializationCandidates( // instructions. break; } - // If any item in the starting block are blacklisted or non-rematable, then + // If any item in the starting block are denylisted or non-rematable, then // break and move on to next start_item (we can actually move to the last // invalid item in this block, but let's ignore that optimization for now). - if (AnyBlacklistedOrNonRematerializable(block, rematerializable_map)) { + if (AnyDenylistedOrNonRematerializable(block, rematerializable_map)) { continue; } while (block.size() <= max_block_size) { @@ -1289,8 +1289,8 @@ MemoryUsageTracker::PickRematerializationCandidates( // Time to update the block to include the next instruction. auto* last_item = block[block.size() - 1]; auto* next_item = instruction_list.next(last_item); - if (next_item == nullptr || next_item->blacklisted || - !next_item->placed || next_item == in_progress_item_ || + if (next_item == nullptr || next_item->denylisted || !next_item->placed || + next_item == in_progress_item_ || !CanBeRematerialized(next_item->instruction, rematerializable_map)) { break; } @@ -1404,7 +1404,7 @@ StatusOr RematerializeInstructions( // instruction it was a copying of. Now 'remat' is a rematerialization // of 'best' and kills 'best'. Stop rematerializing this instruction // to avoid an infinite loop. 
- instruction_list->Blacklist(remat); + instruction_list->Denylist(remat); } remat_move_instructions->insert(remat); } else { @@ -1460,8 +1460,8 @@ StatusOr CompressInstruction(MemoryUsageTracker* memory_tracker, place_before.push_back(instruction_list->GetItem(user)); } - instruction_list->Blacklist(compressed_item->instruction); - instruction_list->Blacklist(uncompressed_item->instruction); + instruction_list->Denylist(compressed_item->instruction); + instruction_list->Denylist(uncompressed_item->instruction); instruction_list->InsertBeforeInstructions(uncompressed_item, place_before); @@ -1583,7 +1583,7 @@ StatusOr HloRematerialization::RematerializeComputation( // rematerialization is added to 'remat_move_instructions' (the // rematerialization is essentially a move). If the next rematerialization of // the instruction is also a move then the rematerialization is added to the - // blacklist. + // denylist. absl::flat_hash_set remat_move_instructions; // The map from instructions to their rematerializable status. diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc index 30a7916c408..83130108dd7 100644 --- a/tensorflow/compiler/xla/service/hlo_runner.cc +++ b/tensorflow/compiler/xla/service/hlo_runner.cc @@ -211,7 +211,8 @@ static std::vector ExecutionInputsFromScopedShapedBuffers( *buffer_tree.mutable_element(index) = execution_input_buffer; } }); - execution_inputs.emplace_back(std::move(buffer_tree)); + execution_inputs.emplace_back(std::move(buffer_tree), + input_buffer.on_host_shape()); } return execution_inputs; } diff --git a/tensorflow/compiler/xla/service/hlo_sharding.cc b/tensorflow/compiler/xla/service/hlo_sharding.cc index b0a03707efb..92270005ffd 100644 --- a/tensorflow/compiler/xla/service/hlo_sharding.cc +++ b/tensorflow/compiler/xla/service/hlo_sharding.cc @@ -39,6 +39,47 @@ HloSharding HloSharding::Tile1D(const Shape& input_shape, int64 num_tiles) { return HloSharding(assignment); } +HloSharding HloSharding::PartialTile( + const Array& group_tile_assignment, + absl::Span> replication_groups) { + auto new_tile_dims = group_tile_assignment.dimensions(); + new_tile_dims.push_back(replication_groups[0].size()); + auto new_tile_assignment = Array(new_tile_dims); + new_tile_assignment.Each([&](absl::Span indices, int64* device) { + std::vector group_index(indices.begin(), indices.end()); + group_index.pop_back(); + int64 group = group_tile_assignment(group_index); + *device = replication_groups[group][indices.back()]; + }); + return PartialTile(new_tile_assignment); +} + +HloSharding HloSharding::PartialTile( + const Array& tile_assignment_last_dim_replicate) { + std::vector> sorted_groups( + tile_assignment_last_dim_replicate.num_elements() / + tile_assignment_last_dim_replicate.dimensions().back()); + auto get_group_id = [&](absl::Span indices) { + int64 group_id = 0; + for (int64 i = 0; i < indices.size() - 1; ++i) { + group_id *= tile_assignment_last_dim_replicate.dim(i); + group_id += indices[i]; + } + return group_id; + }; + tile_assignment_last_dim_replicate.Each( + [&](absl::Span indices, const int64 device) { + sorted_groups[get_group_id(indices)].insert(device); + }); + Array sorted_tile(tile_assignment_last_dim_replicate.dimensions()); + sorted_tile.Each([&](absl::Span indices, int64* device) { + auto begin = sorted_groups[get_group_id(indices)].begin(); + *device = *begin; + sorted_groups[get_group_id(indices)].erase(begin); + }); + return HloSharding(sorted_tile, /*replicate_on_last_tile_dim=*/true); +} + 
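As a rough usage sketch (not part of this patch), the new PartialTile overloads can be driven the same way as the ParseShardingPartialReplication test added later in this change; the device ids and group sizes below are assumptions.

// Illustrative sketch (not part of this change).
#include <vector>
#include "tensorflow/compiler/xla/array.h"
#include "tensorflow/compiler/xla/service/hlo_sharding.h"
#include "tensorflow/core/platform/logging.h"

xla::HloSharding PartialTileSketch() {
  // Two tile groups along dimension 0; devices {0,1} replicate the first
  // tile and devices {2,3} replicate the second.
  xla::Array<xla::int64> group_tiling({2});
  group_tiling(0) = 0;
  group_tiling(1) = 1;
  std::vector<xla::int64> group0_members({0, 1});
  std::vector<xla::int64> group1_members({2, 3});
  xla::HloSharding sharding = xla::HloSharding::PartialTile(
      group_tiling, {group0_members, group1_members});
  // Prints "{devices=[2,2]0,1,2,3 last_tile_dim_replicate}".
  LOG(INFO) << sharding.ToString();
  CHECK_EQ(sharding.NumTiles(), 2);  // 4 devices with replication factor 2.
  return sharding;
}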
HloSharding HloSharding::Tuple(const ShapeTree& sub_shardings) { std::vector flattened_list; flattened_list.reserve(sub_shardings.leaf_count()); @@ -101,8 +142,10 @@ string HloSharding::ToString() const { return StrCat( "{maximal device=", static_cast(*tile_assignment_.begin()), "}"); } - return StrCat("{devices=[", StrJoin(tile_assignment_.dimensions(), ","), "]", - StrJoin(tile_assignment_, ","), "}"); + return StrCat( + "{devices=[", StrJoin(tile_assignment_.dimensions(), ","), "]", + StrJoin(tile_assignment_, ","), + replicate_on_last_tile_dim_ ? " last_tile_dim_replicate}" : "}"); } bool HloSharding::UsesDevice(int64 device) const { @@ -148,6 +191,9 @@ std::vector HloSharding::TileIndexForDevice(int64 device) const { } }); CHECK(!ret_index.empty()); + if (replicate_on_last_tile_dim_) { + ret_index.pop_back(); + } return ret_index; } @@ -157,6 +203,12 @@ int64 HloSharding::DeviceForTileIndex(absl::Span index) const { if (maximal_) { return *tile_assignment_.begin(); } + if (replicate_on_last_tile_dim_ && + index.size() < tile_assignment().num_dimensions()) { + std::vector first_replicated_index(index.begin(), index.end()); + first_replicated_index.push_back(0); + return tile_assignment_(first_replicated_index); + } return tile_assignment_(index); } @@ -167,8 +219,11 @@ std::vector HloSharding::TileOffsetForDevice(const Shape& shape, if (maximal_) { return std::vector(shape.dimensions_size(), 0); } - - CHECK_EQ(shape.dimensions_size(), tile_assignment_.num_dimensions()); + if (replicate_on_last_tile_dim_) { + CHECK_EQ(shape.dimensions_size(), tile_assignment_.num_dimensions() - 1); + } else { + CHECK_EQ(shape.dimensions_size(), tile_assignment_.num_dimensions()); + } std::vector index = TileIndexForDevice(device); for (int64 i = 0; i < index.size(); ++i) { const int64 shape_dim = shape.dimensions(i); @@ -341,8 +396,10 @@ Status HloSharding::ValidateNonTuple(const Shape& shape, return Status::OK(); } - // The tile assignment tensor must have the same rank as the input. - if (shape.rank() != tile_assignment_.num_dimensions()) { + // The tile assignment tensor must have the same rank as the input, or input + // rank + 1 for replicate_on_last_tile_dim_. + if (shape.rank() + (replicate_on_last_tile_dim_ ? 1 : 0) != + tile_assignment_.num_dimensions()) { return tensorflow::errors::InvalidArgument( "Number of tile assignment dimensions is different to the input rank. " "sharding=", @@ -403,7 +460,8 @@ Status HloSharding::ValidateNonTuple(const Shape& shape, proto.tile_assignment_dimensions().end())); std::copy(proto.tile_assignment_devices().begin(), proto.tile_assignment_devices().end(), tile_assignment.begin()); - return HloSharding(tile_assignment); + return proto.replicate_on_last_tile_dim() ? 
PartialTile(tile_assignment) + : HloSharding(tile_assignment); } OpSharding HloSharding::ToProto() const { @@ -429,6 +487,7 @@ OpSharding HloSharding::ToProto() const { result.set_type(OpSharding::MAXIMAL); } else { result.set_type(OpSharding::OTHER); + result.set_replicate_on_last_tile_dim(ReplicateOnLastTileDim()); } return result; } @@ -464,6 +523,17 @@ Shape HloSharding::TileShape(const Shape& shape, int64 device) const { return result_shape; } +int64 HloSharding::NumTiles() const { + if (IsTileMaximal()) { + return 1; + } + if (ReplicateOnLastTileDim()) { + return tile_assignment().num_elements() / + tile_assignment().dimensions().back(); + } + return tile_assignment().num_elements(); +} + HloSharding HloSharding::GetSubSharding(const Shape& shape, const ShapeIndex& index) const { CHECK(IsTuple()); @@ -516,6 +586,9 @@ size_t HloSharding::Hash() const { for (uint32 v : tile_assignment_) { h = tensorflow::Hash64Combine(h, std::hash{}(v)); } + if (replicate_on_last_tile_dim_) { + h = tensorflow::Hash64Combine(h, std::hash{}(1)); + } return h; } diff --git a/tensorflow/compiler/xla/service/hlo_sharding.h b/tensorflow/compiler/xla/service/hlo_sharding.h index 20fa7232e65..e7ba2bc0680 100644 --- a/tensorflow/compiler/xla/service/hlo_sharding.h +++ b/tensorflow/compiler/xla/service/hlo_sharding.h @@ -54,6 +54,19 @@ class HloSharding { return HloSharding(tile_assignment); } + // Creates a new sharding where data is replicated within each replication + // group, and sharded across replication groups according to + // group_tile_assignment. Replication group members will be sorted. + static HloSharding PartialTile( + const Array& group_tile_assignment, + absl::Span> replication_groups); + + // Creates a partially replicated tiled sharding with device-level tile + // assignment, where the last dimension is the additional replication + // dimension. Replication group members will be sorted. + static HloSharding PartialTile( + const Array& tile_assignment_last_dim_replicate); + // Creates a new sharding which splits a one-dimensional input shape into // `num_tiles` tiles. static HloSharding Tile1D(const Shape& input_shape, int64 num_tiles); @@ -115,6 +128,11 @@ class HloSharding { }); } + // Returns if the sharding has partial replication and partial sharding. If + // true, data is sharded according to other dimensions of tile_assignment(), + // but replicated across devices along the last dimension. + bool ReplicateOnLastTileDim() const { return replicate_on_last_tile_dim_; } + // Returns true if the sharding defines an operation on the given device. bool UsesDevice(int64 device) const; @@ -132,6 +150,10 @@ class HloSharding { // Returns the device that should execute the given tile. // It is an error to call this if is_replicated() is true. + // When ReplicateOnLastTileDim() == true, if index.size() == data rank, it + // returns the first device in that replicated subgroup; otherwise, + // index.size() should be the same as tile_assignment()'s rank and specifies + // the member of the replication subgroup. 
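// Editor's illustration (not part of this change): with tile_assignment
// {{0,1},{2,3}} (shape [2,2]) and ReplicateOnLastTileDim() == true, the data
// rank is 1, so DeviceForTileIndex({1}) returns 2, the first device of the
// second replication subgroup, while DeviceForTileIndex({1, 1}) returns 3.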
// REQUIRES: !IsTuple() int64 DeviceForTileIndex(absl::Span index) const; @@ -188,7 +210,8 @@ class HloSharding { bool operator==(const HloSharding& other) const { return replicated_ == other.replicated_ && maximal_ == other.maximal_ && tile_assignment_ == other.tile_assignment_ && - tuple_elements_ == other.tuple_elements_; + tuple_elements_ == other.tuple_elements_ && + replicate_on_last_tile_dim_ == other.replicate_on_last_tile_dim_; } bool operator!=(const HloSharding& other) const { return !(*this == other); } @@ -220,12 +243,17 @@ class HloSharding { // REQUIRES: !IsTuple() Shape TileShape(const Shape& shape, int64 device) const; + // Gets the number of tiles. If it has partial replication, this will not + // equal the device count. + int64 NumTiles() const; + private: HloSharding() : replicated_(true), maximal_(true), tuple_(false), - tile_assignment_({0}) {} + tile_assignment_({0}), + replicate_on_last_tile_dim_(false) {} // device_id values: // -2: magic number to mean unassigned device, used by spatial partitioning // -1: the id of the host @@ -236,18 +264,22 @@ class HloSharding { : replicated_(false), maximal_(true), tuple_(false), - tile_assignment_({1}, device_id) {} - explicit HloSharding(const Array& tile_assignment) + tile_assignment_({1}, device_id), + replicate_on_last_tile_dim_(false) {} + explicit HloSharding(const Array& tile_assignment, + bool replicate_on_last_tile_dim = false) : replicated_(false), maximal_(false), tuple_(false), - tile_assignment_(tile_assignment) {} + tile_assignment_(tile_assignment), + replicate_on_last_tile_dim_(replicate_on_last_tile_dim) {} explicit HloSharding(const std::vector& tuple_shardings) : replicated_(false), maximal_(false), tuple_(true), tile_assignment_({0}), - tuple_elements_(tuple_shardings) {} + tuple_elements_(tuple_shardings), + replicate_on_last_tile_dim_(false) {} // Checks that the number of elements in tuple_elements_ is consistent with // the tuple shape passes as argument. @@ -283,6 +315,11 @@ class HloSharding { // present for the root. This is a flattened list of all the leaf shardings in // a tuple shape, by pre-order walk (ShapeTree iterator order). std::vector tuple_elements_; + // This flag is to support partial replication and partial sharding. If it is + // true, tile_assignment_ will have an extra dimension in addition to the data + // shape rank, and the added last dimension represents the subgroups of + // replications, i.e., elements in slice [..., :] will be replicated. + bool replicate_on_last_tile_dim_; }; std::ostream& operator<<(std::ostream& out, const HloSharding& sharding); diff --git a/tensorflow/compiler/xla/service/hlo_sharding_util.cc b/tensorflow/compiler/xla/service/hlo_sharding_util.cc index 7fc05608800..65295a8e620 100644 --- a/tensorflow/compiler/xla/service/hlo_sharding_util.cc +++ b/tensorflow/compiler/xla/service/hlo_sharding_util.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_sharding_util.h" #include +#include #include "absl/algorithm/container.h" #include "tensorflow/compiler/xla/array.h" @@ -23,6 +24,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_sharding.h" #include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" namespace xla { @@ -190,13 +192,22 @@ absl::optional ReshapeSharding(const Shape& source_shape, target_dims_stack.push_back(t_size); } else if (s_size > t_size) { // Dimension split. - if (s_size % t_size != 0 || t_size % s_partitions != 0) { + if (s_size % t_size != 0 || s_size % s_partitions != 0) { + return absl::nullopt; + } + if (t_size % s_partitions == 0) { + target_tile_assignment_dimensions.push_back(s_partitions); + // We have part of the s_size unprocessed, so put it back to stack. + source_dims_stack.push_back(s_size / t_size); + sharding_tile_dims_stack.push_back(1); + } else if (s_partitions % t_size == 0) { + target_tile_assignment_dimensions.push_back(t_size); + // We have part of the s_size unprocessed, so put it back to stack. + source_dims_stack.push_back(s_size / t_size); + sharding_tile_dims_stack.push_back(s_partitions / t_size); + } else { return absl::nullopt; } - target_tile_assignment_dimensions.push_back(s_partitions); - // We have part of the s_size unprocessed, so put it back to stack. - source_dims_stack.push_back(s_size / t_size); - sharding_tile_dims_stack.push_back(1); } else { // Dimension merge. Also merge the source dimension with the next, and // process it next time. @@ -322,6 +333,10 @@ HloSharding GatherOutputSharding(const HloSharding& index_sharding, } } Array new_tile_assignment = index_sharding.tile_assignment(); + if (new_tile_assignment.num_elements() != + Product(output_tile_assignment_dims)) { + return HloSharding::Replicate(); + } new_tile_assignment.Reshape(output_tile_assignment_dims); return HloSharding::Tile(new_tile_assignment); } @@ -341,6 +356,10 @@ HloSharding GatherIndexSharding(const HloSharding& output_sharding, } } Array new_tile_assignment = output_sharding.tile_assignment(); + if (new_tile_assignment.num_elements() != + Product(index_tile_assignment_dims)) { + return HloSharding::Replicate(); + } new_tile_assignment.Reshape(index_tile_assignment_dims); return HloSharding::Tile(new_tile_assignment); } @@ -413,6 +432,10 @@ HloSharding ScatterIndexSharding(const HloSharding& data_sharding, index_tile_assignment_dims.push_back(1); } Array new_tile_assignment = data_sharding.tile_assignment(); + if (new_tile_assignment.num_elements() != + Product(index_tile_assignment_dims)) { + return HloSharding::Replicate(); + } new_tile_assignment.Reshape(index_tile_assignment_dims); return HloSharding::Tile(new_tile_assignment); } @@ -435,6 +458,10 @@ HloSharding ScatterDataSharding(const HloSharding& index_sharding, } } Array new_tile_assignment = index_sharding.tile_assignment(); + if (new_tile_assignment.num_elements() != + Product(data_tile_assignment_dims)) { + return HloSharding::Replicate(); + } new_tile_assignment.Reshape(data_tile_assignment_dims); return HloSharding::Tile(new_tile_assignment); } @@ -524,6 +551,169 @@ HloSharding ScatterEffectiveDataSharding(const HloSharding& data_sharding, return HloSharding::Tile(tile_assignment); } +namespace { + +// If partitioning in the operand only happens in dimensions in passthrough +// dimensions (offset dimensions in the gather output (or scatter update) that +// have the same size as the operand), returns the corresponding output (or +// update) sharding by passing through the input sharding. 
+absl::optional PassthroughOperandToGatherOutputOrScatterUpdate( + const Shape& operand_shape, const HloSharding& operand_sharding, + const Shape& update_or_gather_shape, + absl::Span collapsed_or_inserted_dims, + absl::Span index_map, + absl::Span offset_or_window_dims, + absl::Span slice_size) { + if (operand_sharding.IsTileMaximal()) { + return operand_sharding; + } + std::vector passthrough_tile(update_or_gather_shape.rank(), 1); + int64 collapsed = 0; + for (int64 i = 0; i < operand_shape.rank(); ++i) { + int64 dim_partitions = operand_sharding.tile_assignment().dim(i); + if (absl::c_linear_search(collapsed_or_inserted_dims, i) || + absl::c_linear_search(index_map, i)) { + if (dim_partitions > 1) { + return absl::nullopt; + } + collapsed++; + continue; + } + if (slice_size[i] != operand_shape.dimensions(i) && dim_partitions > 1) { + return absl::nullopt; + } + int64 offset_dim = offset_or_window_dims[i - collapsed]; + if (i - collapsed > 0 && + offset_dim < offset_or_window_dims[i - collapsed - 1]) { + // Output offsets are transposed, we do not support this case. + return absl::nullopt; + } + passthrough_tile[offset_dim] = dim_partitions; + } + Array tile_assignment = operand_sharding.tile_assignment(); + tile_assignment.Reshape(passthrough_tile); + return HloSharding::Tile(tile_assignment); +} + +// Inverse of PassthroughOperandToGatherOutputOrScatterUpdate. +absl::optional PassthroughGatherOutputOrScatterUpdateToOperand( + const Shape& operand_shape, const HloSharding& update_or_gather_sharding, + absl::Span collapsed_or_inserted_dims, + absl::Span index_map, + absl::Span offset_or_window_dims, + absl::Span slice_size) { + if (update_or_gather_sharding.IsTileMaximal()) { + return update_or_gather_sharding; + } + std::vector passthrough_tile(operand_shape.rank(), 1); + int64 collapsed = 0; + for (int64 i = 0; i < operand_shape.rank(); ++i) { + if (absl::c_linear_search(collapsed_or_inserted_dims, i) || + absl::c_linear_search(index_map, i)) { + collapsed++; + continue; + } + int64 offset_dim = offset_or_window_dims[i - collapsed]; + int64 dim_partitions = + update_or_gather_sharding.tile_assignment().dim(offset_dim); + if (slice_size[i] != operand_shape.dimensions(i) && dim_partitions > 1) { + return absl::nullopt; + } + if (i - collapsed > 0 && + offset_dim < offset_or_window_dims[i - collapsed - 1]) { + // Output offsets are transposed, we do not support this case. 
+ return absl::nullopt; + } + passthrough_tile[i] = dim_partitions; + } + Array tile_assignment = update_or_gather_sharding.tile_assignment(); + if (tile_assignment.num_elements() != Product(passthrough_tile)) { + return absl::nullopt; + } + tile_assignment.Reshape(passthrough_tile); + return HloSharding::Tile(tile_assignment); +} + +} // namespace + +absl::optional GatherOutputShardingFromDataOperand( + const HloSharding& data_operand_sharding, const HloInstruction& hlo) { + const auto& dnums = hlo.gather_dimension_numbers(); + std::vector collapsed_slice_dims(dnums.collapsed_slice_dims().begin(), + dnums.collapsed_slice_dims().end()); + std::vector start_index_map(dnums.start_index_map().begin(), + dnums.start_index_map().end()); + std::vector offset_dims(dnums.offset_dims().begin(), + dnums.offset_dims().end()); + return PassthroughOperandToGatherOutputOrScatterUpdate( + hlo.operand(0)->shape(), data_operand_sharding, hlo.shape(), + collapsed_slice_dims, start_index_map, offset_dims, + hlo.gather_slice_sizes()); +} + +absl::optional GatherDataOperandShardingFromOutput( + const HloSharding& output_sharding, const HloInstruction& hlo) { + const auto& dnums = hlo.gather_dimension_numbers(); + std::vector collapsed_slice_dims(dnums.collapsed_slice_dims().begin(), + dnums.collapsed_slice_dims().end()); + std::vector start_index_map(dnums.start_index_map().begin(), + dnums.start_index_map().end()); + std::vector offset_dims(dnums.offset_dims().begin(), + dnums.offset_dims().end()); + return PassthroughGatherOutputOrScatterUpdateToOperand( + hlo.operand(0)->shape(), output_sharding, collapsed_slice_dims, + start_index_map, offset_dims, hlo.gather_slice_sizes()); +} + +absl::optional ScatterOutputShardingFromUpdate( + const HloSharding& update_sharding, const HloInstruction& hlo) { + const auto& dnums = hlo.scatter_dimension_numbers(); + std::vector inserted_window_dims(dnums.inserted_window_dims().begin(), + dnums.inserted_window_dims().end()); + std::vector scatter_dims_to_operand_dims( + dnums.scatter_dims_to_operand_dims().begin(), + dnums.scatter_dims_to_operand_dims().end()); + std::vector update_window_dims(dnums.update_window_dims().begin(), + dnums.update_window_dims().end()); + std::vector slice_size(hlo.shape().rank(), 1); + int64 num_update_window_dims = 0; + for (int64 i = 0; i < hlo.shape().rank(); ++i) { + if (absl::c_linear_search(dnums.inserted_window_dims(), i)) { + continue; + } + slice_size[i] = hlo.operand(2)->shape().dimensions( + dnums.update_window_dims(num_update_window_dims++)); + } + return PassthroughGatherOutputOrScatterUpdateToOperand( + hlo.shape(), update_sharding, inserted_window_dims, + scatter_dims_to_operand_dims, update_window_dims, slice_size); +} + +absl::optional ScatterUpdateShardingFromOutput( + const HloSharding& output_sharding, const HloInstruction& hlo) { + const auto& dnums = hlo.scatter_dimension_numbers(); + std::vector inserted_window_dims(dnums.inserted_window_dims().begin(), + dnums.inserted_window_dims().end()); + std::vector scatter_dims_to_operand_dims( + dnums.scatter_dims_to_operand_dims().begin(), + dnums.scatter_dims_to_operand_dims().end()); + std::vector update_window_dims(dnums.update_window_dims().begin(), + dnums.update_window_dims().end()); + std::vector slice_size(hlo.shape().rank(), 1); + int64 num_update_window_dims = 0; + for (int64 i = 0; i < hlo.shape().rank(); ++i) { + if (absl::c_linear_search(dnums.inserted_window_dims(), i)) { + continue; + } + slice_size[i] = hlo.operand(2)->shape().dimensions( + 
dnums.update_window_dims(num_update_window_dims++)); + } + return PassthroughOperandToGatherOutputOrScatterUpdate( + hlo.shape(), output_sharding, hlo.operand(2)->shape(), + inserted_window_dims, scatter_dims_to_operand_dims, update_window_dims, + slice_size); +} + StatusOr, HloOpcode>> IdentityValueAndHloOpcodeForScatterReduceComputation( const HloScatterInstruction& scatter) { @@ -588,5 +778,68 @@ std::vector DevicesForSharding( return devices; } +HloSharding PartiallyReplicateTiledShardingOnDims( + const HloSharding& sharding, const std::vector& dims_to_replicate) { + if (sharding.IsTileMaximal()) { + return sharding; + } + int64 group_count = 1; + for (int64 dim : dims_to_replicate) { + if (sharding.ReplicateOnLastTileDim()) { + CHECK_LT(dim, sharding.tile_assignment().num_dimensions()); + } + group_count *= sharding.tile_assignment().dim(dim); + } + if (group_count == 1) { + return sharding; + } + if (group_count == sharding.NumTiles()) { + return HloSharding::Replicate(); + } + std::vector dim_permutation( + sharding.tile_assignment().num_dimensions()); + std::iota(dim_permutation.begin(), dim_permutation.end(), 0); + absl::c_sort(dim_permutation, [&](const int64 a, const int64 b) { + return absl::c_linear_search(dims_to_replicate, a) < + absl::c_linear_search(dims_to_replicate, b); + }); + auto transposed = TransposeSharding(sharding, dim_permutation); + auto new_tile = transposed.tile_assignment(); + std::vector new_tile_shape( + sharding.tile_assignment().dimensions().begin(), + sharding.tile_assignment().dimensions().end()); + for (int64 dim : dims_to_replicate) { + new_tile_shape[dim] = 1; + } + if (sharding.ReplicateOnLastTileDim()) { + new_tile_shape.back() *= group_count; + } else { + new_tile_shape.push_back(group_count); + } + new_tile.Reshape(new_tile_shape); + return HloSharding::PartialTile(new_tile); +} + +HloSharding RemoveShapeDimensions(const HloSharding& sharding, + const std::vector& dims_to_remove) { + if (sharding.IsTileMaximal() || dims_to_remove.empty()) { + return sharding; + } + std::vector new_tile_shape; + new_tile_shape.reserve(sharding.tile_assignment().num_dimensions() - + dims_to_remove.size()); + for (int64 i = 0; i < sharding.tile_assignment().num_dimensions(); ++i) { + if (absl::c_linear_search(dims_to_remove, i)) { + CHECK_EQ(sharding.tile_assignment().dim(i), 1); + } else { + new_tile_shape.push_back(sharding.tile_assignment().dim(i)); + } + } + auto new_tile = sharding.tile_assignment(); + new_tile.Reshape(new_tile_shape); + return sharding.ReplicateOnLastTileDim() ? HloSharding::PartialTile(new_tile) + : HloSharding::Tile(new_tile); +} + } // namespace hlo_sharding_util } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_sharding_util.h b/tensorflow/compiler/xla/service/hlo_sharding_util.h index 562f6d1420d..ce19d8c7a19 100644 --- a/tensorflow/compiler/xla/service/hlo_sharding_util.h +++ b/tensorflow/compiler/xla/service/hlo_sharding_util.h @@ -127,6 +127,26 @@ HloSharding ScatterEffectiveIndexSharding(const HloSharding& index_sharding, HloSharding ScatterEffectiveDataSharding(const HloSharding& data_sharding, const HloInstruction& hlo); +// Returns an output sharding of gather by passing through the data operand's +// sharding. +absl::optional GatherOutputShardingFromDataOperand( + const HloSharding& data_operand_sharding, const HloInstruction& hlo); + +// Returns a data operand sharding of gather by passing through the output's +// sharding. 
+absl::optional GatherDataOperandShardingFromOutput( + const HloSharding& output_sharding, const HloInstruction& hlo); + +// Returns an output sharding of scatter by passing through the update operand's +// sharding. +absl::optional ScatterOutputShardingFromUpdate( + const HloSharding& update_sharding, const HloInstruction& hlo); + +// Returns an update operand sharding of scatter by passing through the output's +// sharding. +absl::optional ScatterUpdateShardingFromOutput( + const HloSharding& output_sharding, const HloInstruction& hlo); + // Returns an identity value and an HloOpcode for reduce computation of scatter // instruction. // - If computation is add/or, return 0/false with corresponding op code; @@ -143,6 +163,17 @@ IdentityValueAndHloOpcodeForScatterReduceComputation( std::vector DevicesForSharding( const HloSharding& sharding, const std::vector& available_devices); +// Returns a sharding that replicates data across devices along the given +// dimensions in the original sharding. +HloSharding PartiallyReplicateTiledShardingOnDims( + const HloSharding& sharding, const std::vector& dims_to_replicate); + +// Returns a sharding that removes the given tile dimensions. +// +// Precondition: if not tile maximal, the size of each tile dimension must be 1. +HloSharding RemoveShapeDimensions(const HloSharding& sharding, + const std::vector& dims_to_remove); + } // namespace hlo_sharding_util } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_sharding_util_test.cc b/tensorflow/compiler/xla/service/hlo_sharding_util_test.cc index 02496c75965..08f136b2e45 100644 --- a/tensorflow/compiler/xla/service/hlo_sharding_util_test.cc +++ b/tensorflow/compiler/xla/service/hlo_sharding_util_test.cc @@ -76,6 +76,20 @@ TEST(HloShardingUtilTest, ReshapeShardingTiledSplit) { EXPECT_EQ(result.value(), output_sharding); } +TEST(HloShardingUtilTest, ReshapeShardingTiledSplit2) { + Shape input_shape = ShapeUtil::MakeShape(F32, {16, 7}); + Shape output_shape = ShapeUtil::MakeShape(F32, {4, 4, 7}); + Array2D tile(16, 1); + tile.FillIota(0); + HloSharding input_sharding = HloSharding::Tile(tile); + tile.Reshape({4, 4, 1}); + HloSharding output_sharding = HloSharding::Tile(tile); + absl::optional result = + ReshapeSharding(input_shape, output_shape, input_sharding); + EXPECT_TRUE(result.has_value()); + EXPECT_EQ(result.value(), output_sharding); +} + TEST(HloShardingUtilTest, ReshapeShardingTiledSplitThenMerge) { Shape input_shape = ShapeUtil::MakeShape(F32, {16, 4, 7}); Shape output_shape = ShapeUtil::MakeShape(F32, {4, 16, 7}); diff --git a/tensorflow/compiler/xla/service/hlo_value.h b/tensorflow/compiler/xla/service/hlo_value.h index a1150ae299d..a721aabef76 100644 --- a/tensorflow/compiler/xla/service/hlo_value.h +++ b/tensorflow/compiler/xla/service/hlo_value.h @@ -57,6 +57,11 @@ struct HloPosition { (instruction->unique_id() == other.instruction->unique_id() && index < other.index); } + + template + friend H AbslHashValue(H h, const HloPosition& pos) { + return H::combine(std::move(h), pos.instruction->Hash(), pos.index); + } }; std::ostream& operator<<(std::ostream& out, const HloPosition& position); @@ -81,6 +86,12 @@ struct HloUse { } bool operator!=(const HloUse& other) const { return !(*this == other); } + + template + friend H AbslHashValue(H h, const HloUse& use) { + return H::combine(std::move(h), use.instruction, use.operand_index, + use.operand_number); + } }; std::ostream& operator<<(std::ostream& out, const HloUse& use); @@ -240,7 +251,8 @@ std::ostream& operator<<(std::ostream&
out, const HloValueSet& hlo_value); // hold multiple HloValueSets. class InstructionValueSet : public ShapeTree { public: - InstructionValueSet(const Shape& shape) : ShapeTree(shape) {} + explicit InstructionValueSet(const Shape& shape) + : ShapeTree(shape) {} // Sets this value set to the union of the given value sets. Returns whether // this value set changed. diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc index 62b0d98418c..d395fddcc5d 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier.cc @@ -670,14 +670,6 @@ Status ShapeVerifier::HandleReduce(HloInstruction* reduce) { } Status ShapeVerifier::HandleBitcast(HloInstruction* bitcast) { - // Bitcasts are not allowed to change the element type. - if (bitcast->operand(0)->shape().element_type() != - bitcast->shape().element_type()) { - return InternalError( - "Bitcast can not change the element type from %s to %s", - PrimitiveType_Name(bitcast->operand(0)->shape().element_type()), - PrimitiveType_Name(bitcast->shape().element_type())); - } if (layout_sensitive_ && shape_size_function_(bitcast->shape()) != shape_size_function_(bitcast->operand(0)->shape())) { diff --git a/tensorflow/compiler/xla/service/hlo_verifier_test.cc b/tensorflow/compiler/xla/service/hlo_verifier_test.cc index d9709c50df9..1f71c9586d5 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier_test.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier_test.cc @@ -540,24 +540,6 @@ TEST_F(HloVerifierTestLayoutSensitive, ConcatWithLayoutChangeNotAllowed) { HasSubstr("Instruction shouldn't change layouts")); } -TEST_F(HloVerifierTest, BitcastCanNotChangeElementType) { - const char* const hlo_string = R"( - HloModule Module - - ENTRY BitcastCanNotChangeElementType { - constant.0 = f32[2] constant({0.0, 0.0}) - ROOT bitcast = s32[2] bitcast(constant.0) - } - )"; - TF_ASSERT_OK_AND_ASSIGN(auto module, - ParseAndReturnUnverifiedModule(hlo_string)); - - auto status = verifier().Run(module.get()).status(); - ASSERT_FALSE(status.ok()); - EXPECT_THAT(status.error_message(), - HasSubstr("Bitcast can not change the element type")); -} - TEST_F(HloVerifierTestLayoutSensitive, BitcastNeedsSameNumberOfElements) { const char* const hlo_string = R"( HloModule Module diff --git a/tensorflow/compiler/xla/service/interpreter/BUILD b/tensorflow/compiler/xla/service/interpreter/BUILD index 7a4eefc1ab6..3444d4cae42 100644 --- a/tensorflow/compiler/xla/service/interpreter/BUILD +++ b/tensorflow/compiler/xla/service/interpreter/BUILD @@ -34,6 +34,7 @@ cc_library( "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla/service:algebraic_simplifier", "//tensorflow/compiler/xla/service:cholesky_expander", + "//tensorflow/compiler/xla/service:comparison_expander", "//tensorflow/compiler/xla/service:compiler", "//tensorflow/compiler/xla/service:computation_placer", "//tensorflow/compiler/xla/service:custom_call_target_registry", diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.cc b/tensorflow/compiler/xla/service/interpreter/compiler.cc index 1649be2ca8f..a059482d832 100644 --- a/tensorflow/compiler/xla/service/interpreter/compiler.cc +++ b/tensorflow/compiler/xla/service/interpreter/compiler.cc @@ -21,6 +21,7 @@ limitations under the License. 
#include "absl/memory/memory.h" #include "tensorflow/compiler/xla/service/algebraic_simplifier.h" #include "tensorflow/compiler/xla/service/cholesky_expander.h" +#include "tensorflow/compiler/xla/service/comparison_expander.h" #include "tensorflow/compiler/xla/service/computation_placer.h" #include "tensorflow/compiler/xla/service/custom_call_target_registry.h" #include "tensorflow/compiler/xla/service/dynamic_index_splitter.h" @@ -81,6 +82,7 @@ Status InterpreterCompiler::RunHloOptimization(HloModule* hlo_module) { pipeline.AddPass(); pipeline.AddPass(); + pipeline.AddPass(); pipeline.AddPass(); pipeline.AddPass( hlo_module->mutable_entry_computation_layout(), diff --git a/tensorflow/compiler/xla/service/interpreter/executable_base.cc b/tensorflow/compiler/xla/service/interpreter/executable_base.cc index 4b020ea2d32..4b6a8aa5202 100644 --- a/tensorflow/compiler/xla/service/interpreter/executable_base.cc +++ b/tensorflow/compiler/xla/service/interpreter/executable_base.cc @@ -81,8 +81,17 @@ StatusOr InterpreterExecutableBase::ExecuteAsyncOnStream( for (int64 i = 0; i < computation->num_parameters(); ++i) { const auto& expected_shape = computation->parameter_instruction(i)->shape(); const auto& actual_shape = argument_buffers[i].on_device_shape(); - if (!Shape::Equal().MinorToMajorOnlyInLayout()(expected_shape, - actual_shape)) { + bool shape_match = true; + if (expected_shape.is_dynamic()) { + if (!ShapeUtil::DynamicArrayShapeIsCompatible(actual_shape, + expected_shape)) { + shape_match = false; + } + } else if (!Shape::Equal().MinorToMajorOnlyInLayout()(expected_shape, + actual_shape)) { + shape_match = false; + } + if (!shape_match) { return InvalidArgument( "Shape mismatch on parameter %d. Expected %s, but was %s.", i, ShapeUtil::HumanStringWithLayout(expected_shape), @@ -100,11 +109,18 @@ StatusOr InterpreterExecutableBase::ExecuteAsyncOnStream( TF_ASSIGN_OR_RETURN(Literal arg_literal, transfer_manager->TransferLiteralFromDevice( run_options->stream(), argument_buffers[p])); + const auto& expected_shape = computation->parameter_instruction(p)->shape(); + if (expected_shape.is_dynamic()) { + // Expand the input literal to expected shape. + arg_literal = arg_literal.ToBoundedDynamic(expected_shape); + } arg_literals.push_back(std::move(arg_literal)); } TF_ASSIGN_OR_RETURN(Literal result_literal, Evaluate(*computation, arg_literals)); + // Shrink the generated dynamic shape into static shape. + result_literal = result_literal.ToStatic(); // Transform the result literal back into a ShapedBuffer. TF_ASSIGN_OR_RETURN(ScopedShapedBuffer result_buffers, diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc index 3c48668e742..bea0f1fb93c 100644 --- a/tensorflow/compiler/xla/service/layout_assignment.cc +++ b/tensorflow/compiler/xla/service/layout_assignment.cc @@ -1357,6 +1357,20 @@ Status LayoutAssignment::PropagateOperandConstraint( // Propagate layouts between operands of the same instruction. This is a // constraint on non-layout-changing instructions. if (!instruction_can_change_layout_func_(user)) { + // Only propgate the layout of the largest concatenate operand. 
+ if (user->opcode() == HloOpcode::kConcatenate) { + for (int64 operand_no = 0; operand_no < user->operand_count(); + ++operand_no) { + const HloInstruction* sibling = user->operand(operand_no); + if (sibling == operand) { + continue; + } + if (sibling->shape().dimensions(user->concatenate_dimension()) > + operand->shape().dimensions(user->concatenate_dimension())) { + return Status::OK(); + } + } + } // Make sure all siblings have the same layout as the operand. for (int64 operand_no = 0; operand_no < user->operand_count(); ++operand_no) { diff --git a/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.cc b/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.cc index f96c985da71..33121635b0b 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.cc @@ -54,9 +54,7 @@ string SanitizeConstantName(const HloInstruction& instr) { return instr_name; } -string ConstantBufferAllocationToGlobalName( - const BufferAllocation& allocation) { - const HloInstruction& instr = InstrForConstantBufferAllocation(allocation); +string ConstantHloToGlobalName(const HloInstruction& instr) { string instr_name = instr.name(); // Check that names are sanitized and stored in the HLO instructions // before constant buffer allocation. @@ -64,6 +62,11 @@ string ConstantBufferAllocationToGlobalName( return absl::StrCat("buffer_for_", instr_name); } +string ConstantBufferAllocationToGlobalName( + const BufferAllocation& allocation) { + return ConstantHloToGlobalName(InstrForConstantBufferAllocation(allocation)); +} + const Literal& LiteralForConstantAllocation( const BufferAllocation& allocation) { return InstrForConstantBufferAllocation(allocation).literal(); diff --git a/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h b/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h index 03e98a66900..2e2d3bf0b48 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h +++ b/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h @@ -24,6 +24,9 @@ namespace llvm_ir { // name of the corresponding constant buffer. In particular, it replaces . and // - with _. string SanitizeConstantName(const HloInstruction& instr); + +string ConstantHloToGlobalName(const HloInstruction& instr); + // In XLA:GPU we map constant buffer allocations to globals in the generated // LLVM IR. This function gives us the name of the global variable a constant // buffer is mapped to. Not used on XLA:CPU. 
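The buffer_assignment_util change above factors the name derivation out of ConstantBufferAllocationToGlobalName so that a caller holding only the constant HloInstruction can obtain the global's name without first resolving a BufferAllocation. A minimal usage sketch, not part of this patch: the EmitGlobalForConstantHlo helper and its linkage choice are hypothetical, and it assumes the existing llvm_ir::ConvertLiteralToIrConstant utility.

    #include "llvm/IR/GlobalVariable.h"
    #include "llvm/IR/Module.h"
    #include "tensorflow/compiler/xla/service/hlo_instruction.h"
    #include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h"
    #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"

    // Hypothetical helper: emit an LLVM global for a constant HLO before any
    // buffer allocation has been assigned to it.
    llvm::GlobalVariable* EmitGlobalForConstantHlo(
        const xla::HloInstruction& constant, llvm::Module* module) {
      llvm::Constant* initializer =
          xla::llvm_ir::ConvertLiteralToIrConstant(constant.literal(), module);
      // The global uses the same "buffer_for_..." name that
      // ConstantBufferAllocationToGlobalName would later produce for the
      // allocation backing this constant, since the latter now forwards to
      // ConstantHloToGlobalName.
      return new llvm::GlobalVariable(
          *module, initializer->getType(), /*isConstant=*/true,
          llvm::GlobalValue::PrivateLinkage, initializer,
          xla::llvm_ir::ConstantHloToGlobalName(constant));
    }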
diff --git a/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc b/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc index daf98478194..d89a9c2e0a5 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc @@ -62,10 +62,11 @@ void EmitTuple(const IrArray& tuple, absl::Span<llvm::Value* const> operands, llvm::IRBuilder<>* b) { llvm::Module* module = getModuleFromBuilder(b); for (size_t i = 0; i < operands.size(); ++i) { + auto* cast = + b->CreatePointerCast(operands[i], PrimitiveTypeToIrType(TUPLE, module)); auto* store = b->CreateStore( - b->CreatePointerCast(operands[i], PrimitiveTypeToIrType(TUPLE, module)), - b->CreateInBoundsGEP(tuple.GetBasePointer(), - {b->getInt64(0), b->getInt64(i)})); + cast, b->CreateInBoundsGEP(tuple.GetBasePointer(), + {b->getInt64(0), b->getInt64(i)})); tuple.AnnotateLoadStoreInstructionWithMetadata(store); } } diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc index c80646e0c70..5def5bbe9db 100644 --- a/tensorflow/compiler/xla/service/local_service.cc +++ b/tensorflow/compiler/xla/service/local_service.cc @@ -114,6 +114,7 @@ ExecutionOptions CreateExecutionOptions( execution_options.set_num_partitions(build_options.num_partitions()); execution_options.set_use_spmd_partitioning( build_options.use_spmd_partitioning()); + execution_options.set_deduplicate_hlo(build_options.deduplicate_hlo()); if (build_options.has_device_assignment()) { TF_CHECK_OK(build_options.device_assignment().Serialize( execution_options.mutable_device_assignment())); diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.cc b/tensorflow/compiler/xla/service/memory_space_assignment.cc index 4b26fba3bab..c5ae0573bed 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment.cc @@ -199,6 +199,12 @@ float MemorySpaceAssignmentCostAnalysis::GetInstructionElapsedDueToMemory( } float MemorySpaceAssignmentCostAnalysis::GetInstructionElapsed( + const HloInstruction& instruction) const { + return std::max(GetInstructionElapsedDueToCompute(instruction), + GetInstructionElapsedDueToMemory(instruction)); +} + +float MemorySpaceAssignmentCostAnalysis::GetInstructionElapsedInAlternateMemory( const HloInstruction& instruction, absl::optional<int64> operand_in_alternate_mem, bool output_in_alternate_mem) const { @@ -229,6 +235,11 @@ int64 InstructionCountPrefetchIntervalPicker::PreferredEvictionEndTime( return std::min(start_time + min_overlap_count_, latest_end_time); } +int64 InstructionCountPrefetchIntervalPicker::LatestPrefetchStartTime( + const HloUse& use, int64 start_time, int64 end_time) const { + return end_time - min_overlap_count_; +} + void InstructionCountPrefetchIntervalPicker::Begin(const HloUse& use, int64 start_time, int64 end_time) { @@ -258,12 +269,15 @@ std::string InstructionCountPrefetchIntervalPicker::ToNoCopyDebugString( CostAnalysisPrefetchIntervalPicker::CostAnalysisPrefetchIntervalPicker( const MemorySpaceAssignmentCostAnalysis& cost_analysis, float min_async_copy_to_overlap_ratio, - float max_async_copy_to_overlap_ratio) + float max_async_copy_to_overlap_ratio, + float preferred_async_copy_to_overlap_ratio) : while_nest_level_( cost_analysis.hlo_live_range().instruction_schedule().size(), 0), cost_analysis_(cost_analysis), min_async_copy_to_overlap_ratio_(min_async_copy_to_overlap_ratio), -
max_async_copy_to_overlap_ratio_(max_async_copy_to_overlap_ratio), + preferred_async_copy_to_overlap_ratio_( + preferred_async_copy_to_overlap_ratio) { instruction_schedule_ = &cost_analysis_.hlo_live_range().instruction_schedule(); @@ -277,12 +291,6 @@ CostAnalysisPrefetchIntervalPicker::CostAnalysisPrefetchIntervalPicker( // To avoid double counting, don't include the elapsed time of while and // conditional HLOs. const HloInstruction* instruction = instruction_and_logical_time.first; - if (instruction->opcode() == HloOpcode::kWhile || - instruction->opcode() == HloOpcode::kConditional) { - continue; - } - float elapsed_time = cost_analysis_.cost_analysis().optimal_seconds( - *instruction_and_logical_time.first); int64 logical_time = instruction_and_logical_time.second; if (logical_time >= instructions_elapsed_time.size()) { instructions_elapsed_time.resize(logical_time + 1, 0.0); @@ -291,6 +299,12 @@ CostAnalysisPrefetchIntervalPicker::CostAnalysisPrefetchIntervalPicker( int nest_level = cost_analysis_.CalculateWhileLoopNestLevel( instruction_and_logical_time.first); while_nest_level_[logical_time] = nest_level; + if (instruction->opcode() == HloOpcode::kWhile || + instruction->opcode() == HloOpcode::kConditional) { + continue; + } + float elapsed_time = cost_analysis_.GetInstructionElapsed( + *instruction_and_logical_time.first); instructions_elapsed_time[logical_time] = elapsed_time * tensorflow::MathUtil::IPow(kWhileExecutionCount, nest_level); @@ -346,6 +360,49 @@ int64 CostAnalysisPrefetchIntervalPicker::PreferredEvictionEndTime( return end_time; } +int64 CostAnalysisPrefetchIntervalPicker::LatestPrefetchStartTime( + const HloUse& use, int64 start_time, int64 end_time) const { + const Shape& shape = ShapeUtil::GetSubshape( + use.instruction->operand(use.operand_number)->shape(), use.operand_index); + // Find the earliest time that satisfies max_async_copy_to_overlap_ratio_. + float async_copy_elapsed = cost_analysis_.GetAsyncCopyElapsed(shape); + // Estimate the time we would save by having this op in alternate memory. + float elapsed_time = cost_analysis_.GetInstructionElapsed(*use.instruction); + float elapsed_time_in_alternate_mem = + cost_analysis_.GetInstructionElapsedInAlternateMemory( + *use.instruction, use.operand_number, + /*output_in_alternate_mem=*/false); + float inst_elapsed_reduction = elapsed_time - elapsed_time_in_alternate_mem; + int end_nest_level = while_nest_level_[end_time]; + + // Find the latest time we're allowed to start prefetching. + float min_interval = min_async_copy_to_overlap_ratio_ * async_copy_elapsed; + int latest_prefetch_time; + for (latest_prefetch_time = end_time - 1; + latest_prefetch_time >= start_time && + (while_nest_level_[latest_prefetch_time] != end_nest_level || + min_interval > + GetLogicalIntervalElapsed(latest_prefetch_time, end_time) + + inst_elapsed_reduction); + --latest_prefetch_time) { + } + + return latest_prefetch_time; +} + +int64 CostAnalysisPrefetchIntervalPicker::LatestPrefetchEndTime( + int64 original_prefetch_end_time, int64 proposed_prefetch_end_time) const { + // Iterate towards the beginning until we find a suitable end time that is the + // same while nest level as the original prefetch end time. 
+ int64 original_nest_level = while_nest_level_[original_prefetch_end_time]; + int64 new_prefetch_end_time; + for (new_prefetch_end_time = proposed_prefetch_end_time; + while_nest_level_[new_prefetch_end_time] != original_nest_level; + --new_prefetch_end_time) { + } + return new_prefetch_end_time; +} + void CostAnalysisPrefetchIntervalPicker::Begin(const HloUse& use, int64 start_time, int64 end_time) { @@ -355,52 +412,100 @@ void CostAnalysisPrefetchIntervalPicker::Begin(const HloUse& use, async_copy_elapsed_ = cost_analysis_.GetAsyncCopyElapsed(shape); // Estimate the time we would save by having this op in alternate memory. float elapsed_time = cost_analysis_.GetInstructionElapsed(*use.instruction); - float elapsed_time_in_alternate_mem = cost_analysis_.GetInstructionElapsed( - *use.instruction, use.operand_number); + float elapsed_time_in_alternate_mem = + cost_analysis_.GetInstructionElapsedInAlternateMemory( + *use.instruction, use.operand_number, + /*output_in_alternate_mem=*/false); inst_elapsed_reduction_ = elapsed_time - elapsed_time_in_alternate_mem; end_logical_time_ = end_time; - earliest_start_logical_time_ = start_time; - int end_nest_level = while_nest_level_[end_time]; - // Find the latest time we're allowed to start prefetching. If the start and - // end nest levels differe look for an earlier prefetch start. - for (current_logical_prefetch_time_ = end_time - 1; - current_logical_prefetch_time_ > start_time && - (while_nest_level_[current_logical_prefetch_time_] != end_nest_level || - min_async_copy_to_overlap_ratio_ * async_copy_elapsed_ > - GetLogicalIntervalElapsed(current_logical_prefetch_time_, - end_logical_time_) + - inst_elapsed_reduction_); - --current_logical_prefetch_time_) { + int end_nest_level = while_nest_level_[end_logical_time_]; + + // Find the latest time we're allowed to start prefetching. + float min_interval = min_async_copy_to_overlap_ratio_ * async_copy_elapsed_; + latest_prefetch_time_ = LatestPrefetchStartTime(use, start_time, end_time); + + // Find the earliest time we're allowed to start prefetching. + float max_interval = max_async_copy_to_overlap_ratio_ * + max_overlap_multiplier_ * async_copy_elapsed_; + for (earliest_prefetch_time_ = start_time; + earliest_prefetch_time_ <= end_logical_time_ && + (while_nest_level_[earliest_prefetch_time_] != end_nest_level || + max_interval < GetLogicalIntervalElapsed(earliest_prefetch_time_, + end_logical_time_)); + ++earliest_prefetch_time_) { } + if (earliest_prefetch_time_ > latest_prefetch_time_) { + // There is no available prefetch interval for the given start and end + // times. Set the iterators accordingly to ensure Done() returns true. + increasing_prefetch_time_iterator_ = earliest_prefetch_time_; + decreasing_prefetch_time_iterator_ = latest_prefetch_time_; + CHECK(Done()); + return; + } + + // Between the earliest and latest prefetch interval, find the interval + // closest to the preferred interval and start iterating from there. 
+ int64 starting_prefetch_time = earliest_prefetch_time_; + float preferred_interval = + preferred_async_copy_to_overlap_ratio_ * async_copy_elapsed_; + float best_interval = + GetLogicalIntervalElapsed(earliest_prefetch_time_, end_logical_time_); + for (int64 prefetch_time = earliest_prefetch_time_ + 1; + prefetch_time <= latest_prefetch_time_; ++prefetch_time) { + float interval = + GetLogicalIntervalElapsed(prefetch_time, end_logical_time_); + if (while_nest_level_[prefetch_time] == end_nest_level && + std::abs(preferred_interval - interval) < + std::abs(preferred_interval - best_interval)) { + best_interval = interval; + starting_prefetch_time = prefetch_time; + } + } + VLOG(4) << "Interval min/max/preferred = " << min_interval << " " + << max_interval << " " << preferred_interval + << " prefetch time earliest/latest/starting = " + << earliest_prefetch_time_ << " " << latest_prefetch_time_ << " " + << starting_prefetch_time; + + increasing_prefetch_time_iterator_ = starting_prefetch_time; + decreasing_prefetch_time_iterator_ = starting_prefetch_time; + using_increasing_prefetch_time_iterator_ = true; + // Since both iterators start at the same position, call Next() once to + // advance one of the iterators. + Next(); } int64 CostAnalysisPrefetchIntervalPicker::Next() { CHECK(!Done()) << "Prefetch interval picker's Next() is called even though " "Done() is false"; - int64 prefetch_time = current_logical_prefetch_time_; - if (!Done()) { - --current_logical_prefetch_time_; + if (using_increasing_prefetch_time_iterator_) { + int64 prefetch_time = increasing_prefetch_time_iterator_++; + while (increasing_prefetch_time_iterator_ <= latest_prefetch_time_ && + while_nest_level_[increasing_prefetch_time_iterator_] != + while_nest_level_[end_logical_time_]) { + ++increasing_prefetch_time_iterator_; + } + if (decreasing_prefetch_time_iterator_ >= earliest_prefetch_time_) { + using_increasing_prefetch_time_iterator_ = false; + } + return prefetch_time; + } else { + int64 prefetch_time = decreasing_prefetch_time_iterator_--; + while (decreasing_prefetch_time_iterator_ >= earliest_prefetch_time_ && + while_nest_level_[decreasing_prefetch_time_iterator_] != + while_nest_level_[end_logical_time_]) { + --decreasing_prefetch_time_iterator_; + } + if (increasing_prefetch_time_iterator_ <= latest_prefetch_time_) { + using_increasing_prefetch_time_iterator_ = true; + } + return prefetch_time; } - // If the prefetch start and end times differ, look for an earlier prefetch - // start. 
- while (!Done() && while_nest_level_[current_logical_prefetch_time_] != - while_nest_level_[end_logical_time_]) { - --current_logical_prefetch_time_; - } - return prefetch_time; } bool CostAnalysisPrefetchIntervalPicker::Done() const { - if (current_logical_prefetch_time_ < earliest_start_logical_time_) { - return true; - } - float logical_interval_elapsed = GetLogicalIntervalElapsed( - current_logical_prefetch_time_, end_logical_time_); - return (max_async_copy_to_overlap_ratio_ * max_overlap_multiplier_ * - async_copy_elapsed_ < - logical_interval_elapsed) || - (min_async_copy_to_overlap_ratio_ * async_copy_elapsed_ > - logical_interval_elapsed + inst_elapsed_reduction_); + return increasing_prefetch_time_iterator_ > latest_prefetch_time_ && + decreasing_prefetch_time_iterator_ < earliest_prefetch_time_; } void CostAnalysisPrefetchIntervalPicker::SetRetryNumber(int retry_number) { @@ -440,13 +545,16 @@ float CostAnalysisPrefetchIntervalPicker::GetLogicalIntervalElapsed( } std::string CostAnalysisPrefetchIntervalPicker::ToDebugString() const { + int current_logical_prefetch_time = using_increasing_prefetch_time_iterator_ + ? increasing_prefetch_time_iterator_ + : decreasing_prefetch_time_iterator_; float logical_interval_elapsed = GetLogicalIntervalElapsed( - current_logical_prefetch_time_, end_logical_time_); + current_logical_prefetch_time, end_logical_time_); return absl::StrCat( "Async copy elapsed (s) = ", async_copy_elapsed_, ", inst elapsed reduction (s) = ", inst_elapsed_reduction_, ", logical interval elapsed (s) = ", logical_interval_elapsed, - ", interval = (", current_logical_prefetch_time_, ", ", end_logical_time_, + ", interval = (", current_logical_prefetch_time, ", ", end_logical_time_, ")"); } @@ -466,6 +574,24 @@ CostAnalysisPrefetchIntervalPicker::BufferIntervalAlternateMemoryBenefit( return cost_analysis_.GetMemoryBoundedness(interval); } +bool MemorySpaceAssignment::Allocation::operator==( + const MemorySpaceAssignment::Allocation& other) const { + return defining_position() == other.defining_position() && + uses() == other.uses() && memory_space() == other.memory_space() && + chunk() == other.chunk() && start_time() == other.start_time() && + end_time() == other.end_time() && + is_copy_allocation() == other.is_copy_allocation(); +} + +bool MemorySpaceAssignment::CopyAllocation::operator==( + const MemorySpaceAssignment::CopyAllocation& other) const { + return static_cast(*this) == + static_cast(other) && + copy_done_schedule_before() == other.copy_done_schedule_before() && + copy_start_schedule_after() == other.copy_start_schedule_after() && + copy_start() == other.copy_start() && copy_done() == other.copy_done(); +} + std::string MemorySpaceAssignment::AllocationValue::ToString() const { std::string out = absl::StrCat("computation = ", computation()->name()); absl::StrAppend(&out, "\n position:\n"); @@ -484,7 +610,9 @@ std::string MemorySpaceAssignment::AllocationValue::ToShortString() const { } void AlternateMemoryBestFitHeap::CreateAllocationValues( - const HloValue* value, std::vector* allocation_values) { + const AlternateMemoryBestFitHeap::BufferInterval& buffer_interval, + std::vector& allocation_values) const { + const HloValue* value = buffer_interval.buffer; VLOG(3) << "Creating AllocationValues for: " << value->ToString(); // Find and sort all non-trivial (excluding GTE, Tuple, and bitcast) @@ -512,10 +640,10 @@ void AlternateMemoryBestFitHeap::CreateAllocationValues( // Create an AllocationValue for each non-trivial position. 
absl::flat_hash_set computations; - int beginning_idx = allocation_values->size(); + int beginning_idx = allocation_values.size(); for (int i = 0; i < positions.size(); ++i) { const HloPosition& position = positions.at(i); - allocation_values->emplace_back(value, position); + allocation_values.emplace_back(value, position, buffer_interval.size); } std::vector uses(value->uses()); @@ -536,8 +664,8 @@ void AlternateMemoryBestFitHeap::CreateAllocationValues( HloComputation* use_computation = use.instruction->parent(); AllocationValue* last_allocation_value = nullptr; - for (int i = beginning_idx; i < allocation_values->size(); ++i) { - AllocationValue* allocation_value = &allocation_values->at(i); + for (int i = beginning_idx; i < allocation_values.size(); ++i) { + AllocationValue* allocation_value = &allocation_values.at(i); if (allocation_value->computation() == use_computation && instruction_schedule.at( allocation_value->defining_position().instruction) < use_time) { @@ -548,9 +676,9 @@ void AlternateMemoryBestFitHeap::CreateAllocationValues( last_allocation_value->AddUse(use, use_time); } - for (int i = beginning_idx; i < allocation_values->size(); ++i) { + for (int i = beginning_idx; i < allocation_values.size(); ++i) { VLOG(3) << "Created allocation value: " - << allocation_values->at(i).ToString(); + << allocation_values.at(i).ToString(); } } @@ -774,7 +902,7 @@ void AlternateMemoryBestFitHeap::AppendBufferInfoDebugString( std::vector use_names; use_times.reserve(uses.size()); use_names.reserve(uses.size()); - for (auto use : uses) { + for (const auto& use : uses) { use_times.push_back(use.first); use_names.push_back(use.second); } @@ -794,27 +922,27 @@ void AlternateMemoryBestFitHeap::AppendBufferInfoDebugString( } void AlternateMemoryBestFitHeap::AppendAllocationInfoDebugString( - const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval, + const AllocationValue& value, const MemorySpaceAssignment::Allocation& allocation, - std::string* debug_str) const { + std::string& debug_str) const { // Columns in allocation information: // buffer_id: int. This value can be used the match with buffer info. // size: int. In bytes. // offset: int. In bytes. // start_time: int. Logical start time of the allocation. // end_time: int. Logical end time of the allocation. - if (debug_str->empty()) { + if (debug_str.empty()) { // Append the column names. 
- absl::StrAppend(debug_str, "buffer_id,size,offset,start_time,end_time\n"); + absl::StrAppend(&debug_str, "buffer_id,size,offset,start_time,end_time\n"); } if (allocation.memory_space() == MemorySpace::kAlternate) { const HloBuffer& buffer = - alias_analysis_.GetBufferContainingValue(*interval.buffer); - absl::StrAppend(debug_str, buffer.id(), ","); - absl::StrAppend(debug_str, interval.size, ","); - absl::StrAppend(debug_str, allocation.chunk().offset, ","); - absl::StrAppend(debug_str, allocation.start_time(), ","); - absl::StrAppend(debug_str, allocation.end_time(), "\n"); + alias_analysis_.GetBufferContainingValue(*value.value()); + absl::StrAppend(&debug_str, buffer.id(), ","); + absl::StrAppend(&debug_str, value.size(), ","); + absl::StrAppend(&debug_str, allocation.chunk().offset, ","); + absl::StrAppend(&debug_str, allocation.start_time(), ","); + absl::StrAppend(&debug_str, allocation.end_time(), "\n"); } } @@ -845,6 +973,16 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { } } + for (const auto& interval : sorted_buffer_intervals) { + auto colocated_intervals = GetSortedColocatedIntervals(interval); + if (AreIntervalsReservedInAlternateMemory(colocated_intervals)) { + // Increment the reserved part of alternate memory so that it is not + // available for other buffers. + reserved_in_bytes_ += options_.size_fn(*interval.buffer); + } + } + VLOG(2) << "Total reserved bytes = " << reserved_in_bytes_; + for (auto& interval : sorted_buffer_intervals) { if (!interval.need_allocation) { continue; @@ -872,8 +1010,7 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { if (AreIntervalsReservedInAlternateMemory(colocated_intervals)) { VLOG(3) << "Interval " << interval.buffer->ToShortString() - << " is reserved in the alternate memory. Total reserved bytes = " - << reserved_in_bytes_; + << " is reserved in the alternate memory."; for (const BufferInterval* colocated_interval : colocated_intervals) { const HloValue* value = colocated_interval->buffer; // Color all of the aliased reserved buffers here because reserved @@ -889,10 +1026,6 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { options_.alternate_memory_space); } } - // Increment the reserved part of alternate memory so that it is not - // available for other buffers. Since all colocated intervals should have - // the same size, just use the first one. - reserved_in_bytes_ += options_.size_fn(*colocated_intervals[0]->buffer); continue; } @@ -913,16 +1046,43 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { AppendBufferInfoDebugString(interval, &buffer_info_str_); + std::vector allocation_values; + CreateAllocationValuesFromColocatedIntervals(colocated_intervals, + allocation_values); + // Retry allocating this value with larger limits if allocation fails. for (int retry_number = 0; retry_number < options_.max_retries; retry_number++) { - final_retry_ = (retry_number == options_.max_retries - 1); + bool final_retry = (retry_number == options_.max_retries - 1); options_.prefetch_interval_picker->SetRetryNumber(retry_number); - bool success = AllocateColocatedIntervals(colocated_intervals); - if (success) { + Result result = + AllocateAllocationValues(absl::MakeSpan(allocation_values)); + VLOG(2) << "Allocation result = " + << absl::StrFormat("%x", static_cast(result)); + if (result_requires_uncommit(result) || + (!final_retry && result_failed_because_of_async_copy(result))) { + UncommitPendingChunks(absl::MakeSpan(allocation_values)); + VLOG(2) << "Couldn't allocate. 
Retry number " << retry_number; + } else if (result_is(result, Result::kFailOutOfMemory) && + num_repacks_ < options_.max_repacks) { + UncommitPendingChunks(absl::MakeSpan(allocation_values)); + ++num_repacks_; + CHECK_NE(options_.repacker, nullptr); + std::vector repack_allocation_blocks; + ExportAllocationsForRepacking(repack_allocation_blocks); + VLOG(2) << "Repacking."; + auto repack_status = + options_.repacker->Repack(absl::MakeSpan(repack_allocation_blocks)); + CHECK_EQ(repack_status.status(), Status::OK()); + VLOG(2) << "Repack complete. Modified = " << *repack_status; + if (*repack_status) { + ImportRepackedAllocations(absl::MakeSpan(repack_allocation_blocks)); + --retry_number; + } + } else { + FinalizeAllocations(absl::MakeSpan(allocation_values)); break; } - VLOG(2) << "Couldn't allocate. Retry number " << retry_number; } } @@ -935,9 +1095,10 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { return result_; } -bool AlternateMemoryBestFitHeap::AllocateColocatedIntervals( - const std::vector& - colocated_intervals) { +void AlternateMemoryBestFitHeap::CreateAllocationValuesFromColocatedIntervals( + absl::Span + colocated_intervals, + std::vector& allocation_values) { // TODO(berkin): For now, place the phi values due to conditionals in // default memory. for (const BufferInterval* colocated_interval : colocated_intervals) { @@ -958,11 +1119,15 @@ bool AlternateMemoryBestFitHeap::AllocateColocatedIntervals( } // Create AllocationValues for all the colocated intervals. - std::vector allocation_values; for (const auto& colocated_interval : colocated_intervals) { - CreateAllocationValues(colocated_interval->buffer, &allocation_values); + CreateAllocationValues(*colocated_interval, allocation_values); } FindAliases(&allocation_values); +} + +AlternateMemoryBestFitHeap::Result +AlternateMemoryBestFitHeap::AllocateAllocationValues( + absl::Span allocation_values) { const auto& instruction_schedule = hlo_live_range_.instruction_schedule(); // Data structure to contain the preferred offset for a given computation. @@ -971,8 +1136,8 @@ bool AlternateMemoryBestFitHeap::AllocateColocatedIntervals( absl::flat_hash_map preferred_offset_for_computation; - bool allocation_success = true; - for (auto& allocation_value : allocation_values) { + Result result = Result::kSuccess; + for (AllocationValue& allocation_value : allocation_values) { int64 definition_time = instruction_schedule.at(allocation_value.defining_instruction()); @@ -1086,20 +1251,19 @@ bool AlternateMemoryBestFitHeap::AllocateColocatedIntervals( request.start_time = std::min(definition_time, use_time); request.end_time = use_time; request.latest_prefetch_time = latest_prefetch_time; - request.size = colocated_intervals[0]->size; + request.size = allocation_value.size(); request.allow_no_copy_alternate_mem_allocation = allow_no_copy_alternate_mem_allocation; request.earliest_prefetch_time = earliest_prefetch_time; request.preferred_offset = preferred_offset; request.use = &use; request.allocation_value = &allocation_value; - if (!AllocateSegment(request)) { + result_mark(AllocateSegment(request), result); + if (result_requires_uncommit(result)) { // If the allocation finding failed (e.g., due to running out of // asynchronous copies), then fall back to allocating the buffer // entirely in the default memory. 
- UncommitPendingChunks(); - allocation_success = false; - break; + return result; } // If there are multiple uses, they can try using the memory allocation @@ -1125,24 +1289,8 @@ bool AlternateMemoryBestFitHeap::AllocateColocatedIntervals( aliased_allocation->chunk().offset; } } - if (!allocation_success) { - break; - } } - if (allocation_success) { - for (AllocationValue& allocation_value : allocation_values) { - for (auto& allocation : *allocation_value.allocation_sequence()) { - AppendAllocationInfoDebugString(*colocated_intervals[0], *allocation, - &allocation_info_str_); - allocations_->push_back(std::move(allocation)); - } - } - } - - pending_chunks_.clear(); - pending_async_copies_.clear(); - pending_required_assignments_.clear(); - return allocation_success; + return result; } bool operator<(const AsynchronousCopy& a, const AsynchronousCopy& b) { @@ -1162,15 +1310,21 @@ void AsynchronousCopyOrdering::RemoveCopy(const AsynchronousCopy& copy) { ranges_.erase(copy_it); } -bool AsynchronousCopyOrdering::ViolatesOrdering(int64 start_time, - int64 end_time) const { +absl::optional<AsynchronousCopy> AsynchronousCopyOrdering::ViolatesOrdering( + int64 start_time, int64 end_time) const { // We allow identical start and end times. It is enough to check for just the // start time in case we find a match in ranges_ because the found value will // either be identical to {start_time, end_time} (and this doesn't violate) or // its start_time will be smaller and end_time will be larger (this violates). auto copy_it = ranges_.find( {start_time, end_time, MemorySpaceAssignment::MemorySpace::kAlternate}); - return copy_it != ranges_.end() && copy_it->start_time != start_time; + if (copy_it != ranges_.end() && copy_it->start_time != start_time) { + VLOG(4) << "Violates ordering: (" << start_time << ", " << end_time + << ") and (" << copy_it->start_time << ", " << copy_it->end_time + << ")"; + return *copy_it; + } + return absl::nullopt; } /*static*/ MemorySpaceAssignment::Allocation* @@ -1228,9 +1382,7 @@ void AlternateMemoryBestFitHeap::AllocateCrossProgramPrefetchBuffer( allocations_->push_back(std::move(allocation)); } - pending_chunks_.clear(); - pending_async_copies_.clear(); - pending_required_assignments_.clear(); + ClearPendingChunks(); } absl::optional @@ -1407,7 +1559,40 @@ bool AlternateMemoryBestFitHeap::AreIntervalsReservedInAlternateMemory( return false; } -void AlternateMemoryBestFitHeap::UncommitPendingChunks() { +void AlternateMemoryBestFitHeap::ExportAllocationsForRepacking( + std::vector<RepackAllocationBlock*>& + allocations) { + for (RepackAllocationBlock& allocation_block : repack_allocation_blocks_) { + allocations.push_back(&allocation_block); + } +} + +void AlternateMemoryBestFitHeap::ImportRepackedAllocations( + absl::Span<RepackAllocationBlock*> + repacked_allocations) { + interval_tree_ = {}; + for (RepackAllocationBlock* allocation_block : repacked_allocations) { + MemorySpaceAssignment::Allocation* allocation = allocation_block->opaque; + VLOG(3) << "Moved " << allocation->ToString() << ", size " + << allocation->chunk().size << " from " + << allocation_block->initial_offset << " to " + << allocation_block->offset; + allocation_block->opaque->mutable_chunk()->offset = + allocation_block->offset; + interval_tree_.Add(allocation_block->start_time, allocation_block->end_time, + {allocation_block->offset, allocation_block->size}); + allocation_block->initial_offset = allocation_block->offset; + allocation_block->offset = -1; + } +} + +void AlternateMemoryBestFitHeap::UncommitPendingChunks( + absl::Span<AllocationValue> allocation_values) { + // Clear the allocation sequences of the allocation values so that they can + // be rebuilt in case we retry allocation after uncommitting. + for (AllocationValue& allocation_value : allocation_values) { + allocation_value.allocation_sequence()->clear(); + } for (const auto& interval_and_chunk : pending_chunks_) { const BufferInterval& interval = interval_and_chunk.first; const Chunk& chunk = interval_and_chunk.second.chunk; @@ -1446,6 +1631,48 @@ void AlternateMemoryBestFitHeap::UncommitPendingChunks() { } } } + ClearPendingChunks(); +} + +void AlternateMemoryBestFitHeap::FinalizeAllocations( + absl::Span<AllocationValue> allocation_values) { + absl::flat_hash_map<int64, std::vector<MemorySpaceAssignment::Allocation*>> + colocation_map; + for (AllocationValue& allocation_value : allocation_values) { + for (auto& allocation : *allocation_value.allocation_sequence()) { + AppendAllocationInfoDebugString(allocation_value, *allocation, + allocation_info_str_); + allocations_->push_back(std::move(allocation)); + MemorySpaceAssignment::Allocation* inserted_allocation = + allocations_->back().get(); + if (inserted_allocation->memory_space() == MemorySpace::kAlternate) { + colocation_map[inserted_allocation->chunk().offset].push_back( + inserted_allocation); + } + } + } + // Assume allocations that received the same offset need to be colocated. + // Export these to repack_allocation_blocks_ so that we can repack them to + // reduce fragmentation. + for (auto& colocation : colocation_map) { + std::vector<RepackAllocationBlock*> colocations; + for (MemorySpaceAssignment::Allocation* colocated_allocation : + colocation.second) { + repack_allocation_blocks_.push_back( + {colocated_allocation->start_time(), colocated_allocation->end_time(), + colocated_allocation->chunk().size, /*offset=*/-1, + colocated_allocation->chunk().offset, /*colocations=*/{}, + colocated_allocation}); + colocations.push_back(&repack_allocation_blocks_.back()); + } + for (RepackAllocationBlock* repack_block : colocations) { + repack_block->colocations = colocations; + } + } + ClearPendingChunks(); +} + +void AlternateMemoryBestFitHeap::ClearPendingChunks() { pending_chunks_.clear(); pending_async_copies_.clear(); pending_required_assignments_.clear(); @@ -1461,7 +1688,7 @@ void AlternateMemoryBestFitHeap::AddToPendingChunks( CommitChunk(buffer_interval, chunk_candidate); } -bool AlternateMemoryBestFitHeap::AllocateSegment( +AlternateMemoryBestFitHeap::Result AlternateMemoryBestFitHeap::AllocateSegment( const AllocationRequest& request) { auto allocation_sequence = request.allocation_value->allocation_sequence(); // start_time == end_time is a special case where the value is consumed @@ -1472,7 +1699,7 @@ bool AlternateMemoryBestFitHeap::AllocateSegment( GetLiveAllocationAt(*allocation_sequence, request.end_time); CHECK_NE(allocation, nullptr); allocation->AddUse(request.use->hlo_use); - return true; + return Result::kSuccess; } const HloPosition& defining_position = @@ -1536,12 +1763,15 @@ bool AlternateMemoryBestFitHeap::AllocateSegment( } } + Result allocation_result = Result::kSuccess; // First try keeping the allocation entirely in the alternate memory.
if (required_memory_space_at_start != MemorySpace::kDefault && required_memory_space_at_end != MemorySpace::kDefault && - request.allow_no_copy_alternate_mem_allocation && - AllocateInAlternateMemoryNoCopy(request)) { - return true; + request.allow_no_copy_alternate_mem_allocation) { + allocation_result = AllocateInAlternateMemoryNoCopy(request); + if (allocation_result == Result::kSuccess) { + return Result::kSuccess; + } } auto prev_allocation_it = allocation_sequence->rbegin(); @@ -1560,8 +1790,10 @@ bool AlternateMemoryBestFitHeap::AllocateSegment( (*prev_allocation_it)->defining_position() == defining_position) { // If there was an allocation for this HloValue that was in the alternate // memory space, we also need to perform an eviction. - if (!Evict(request)) { - return false; + Result eviction_result = Evict(request); + if (eviction_result != Result::kSuccess) { + // A non-success eviction requires us to uncommit previous allocations. + return result_mark(Result::kFailRequiresUncommit, eviction_result); } prev_allocation_in_default_mem_it = allocation_sequence->rbegin(); } else if (prev_allocation_in_default_mem_it == allocation_sequence->rend()) { @@ -1582,31 +1814,28 @@ bool AlternateMemoryBestFitHeap::AllocateSegment( << "Not trying to prefetch because use requires buffer in default mem."; (*prev_allocation_in_default_mem_it)->Extend(request.end_time); (*prev_allocation_in_default_mem_it)->AddUse(request.use->hlo_use); - return true; + return Result::kSuccess; } // Finally, try to prefetch the buffer into alternate memory. - if (Prefetch(request, **prev_allocation_in_default_mem_it)) { - return true; - } - if (!final_retry_ && prefetch_failed_due_to_async_copy_) { - // If prefetching failed due to asynchronous copy and we're not in our final - // try, return false (failure) so that we can retry this interval with - // larger limits. - return false; + Result prefetch_result = + Prefetch(request, **prev_allocation_in_default_mem_it); + if (prefetch_result == Result::kSuccess) { + return Result::kSuccess; } + result_mark(prefetch_result, allocation_result); // If the end assignment was required to be in alternate memory but that // wasn't possible, then this allocation is invalid. if (required_memory_space_at_end == MemorySpace::kAlternate) { - return false; + return result_mark(Result::kFailRequiresUncommit, allocation_result); } // If a copy wasn't inserted, then add this use to the latest allocation in // default memory. 
(*prev_allocation_in_default_mem_it)->Extend(request.end_time); (*prev_allocation_in_default_mem_it)->AddUse(request.use->hlo_use); - return true; + return allocation_result; } void AlternateMemoryBestFitHeap::AddAsyncCopy( @@ -1667,12 +1896,14 @@ bool AlternateMemoryBestFitHeap::ViolatesMaximumOutstandingAsyncCopies( } } -bool AlternateMemoryBestFitHeap::ViolatesAsyncCopyOrdering( - int64 start_time, int64 end_time) const { +absl::optional<AsynchronousCopy> +AlternateMemoryBestFitHeap::ViolatesAsyncCopyOrdering(int64 start_time, + int64 end_time) const { return async_copy_ordering_.ViolatesOrdering(start_time, end_time); } -bool AlternateMemoryBestFitHeap::AllocateInAlternateMemoryNoCopy( +AlternateMemoryBestFitHeap::Result +AlternateMemoryBestFitHeap::AllocateInAlternateMemoryNoCopy( const AllocationRequest& request) { MemorySpaceAssignment::Allocation* prev_allocation = nullptr; bool can_eliminate_copy = false; @@ -1691,7 +1922,7 @@ bool AlternateMemoryBestFitHeap::AllocateInAlternateMemoryNoCopy( } if (!can_eliminate_copy) { - return false; + return Result::kFailPrevAllocationNotInAlternateMem; } const HloPosition& defining_position = @@ -1699,7 +1930,7 @@ bool AlternateMemoryBestFitHeap::AllocateInAlternateMemoryNoCopy( if (!options_.prefetch_interval_picker->CanAllocateInAlternateMemoryNoCopy( defining_position.shape(), request.start_time + 1, request.end_time)) { - return false; + return Result::kFailLiveRangeTooLong; } BufferInterval alternate_mem_interval; @@ -1778,12 +2009,13 @@ bool AlternateMemoryBestFitHeap::AllocateInAlternateMemoryNoCopy( } request.allocation_value->allocation_sequence()->back()->AddUse( request.use->hlo_use); - return true; + return Result::kSuccess; } - return false; + return Result::kFailOutOfMemory; } -bool AlternateMemoryBestFitHeap::Evict(const AllocationRequest& request) { +AlternateMemoryBestFitHeap::Result AlternateMemoryBestFitHeap::Evict( + const AllocationRequest& request) { CHECK_GT(request.allocation_value->allocation_sequence()->size(), 0); MemorySpaceAssignment::Allocation* prev_allocation = request.allocation_value->allocation_sequence()->back().get(); @@ -1872,13 +2104,62 @@ bool AlternateMemoryBestFitHeap::Evict(const AllocationRequest& request) { << " and " << hlo_live_range_.flattened_instruction_sequence() .instructions()[eviction_end_time]; - return false; + return Result::kFailOutOfAsyncCopies; } } - return true; + return Result::kSuccess; } -bool AlternateMemoryBestFitHeap::Prefetch( +int64 AlternateMemoryBestFitHeap::FindPrefetchEndTime( const AllocationRequest& request, int64 earliest_prefetch_time) const { + int64 prefetch_end_time = request.latest_prefetch_time; + + for (int retry_number = 0; + retry_number < options_.prefetch_copy_done_reorder_max_retries; + ++retry_number) { + int64 latest_prefetch_time = + options_.prefetch_interval_picker->LatestPrefetchStartTime( + request.use->hlo_use, earliest_prefetch_time, prefetch_end_time); + VLOG(4) << "Latest prefetch start time = " << latest_prefetch_time + << ", earliest prefetch start time = " << earliest_prefetch_time + << ", prefetch end time = " << prefetch_end_time; + // Return if we couldn't find a suitable prefetch start time. + if (latest_prefetch_time < earliest_prefetch_time) { + break; + } + + // Return either if there is no other violating asynchronous copy (since we + // don't need to change the prefetch end time) or if the violating + // asynchronous copy ends after the prefetch end time.
+ auto violating_async_copy = + ViolatesAsyncCopyOrdering(latest_prefetch_time, prefetch_end_time); + if (!violating_async_copy || + violating_async_copy->end_time >= prefetch_end_time) { + break; + } + VLOG(4) << "Violating async copy: (" << violating_async_copy->start_time + << ", " << violating_async_copy->end_time << ")"; + + int64 new_prefetch_end_time = + options_.prefetch_interval_picker->LatestPrefetchEndTime( + prefetch_end_time, violating_async_copy->end_time); + if (new_prefetch_end_time > earliest_prefetch_time) { + VLOG(3) << "Update prefetch end time = " << new_prefetch_end_time; + prefetch_end_time = new_prefetch_end_time; + } else { + VLOG(3) << "Can't update prefetch end time = " << new_prefetch_end_time + << " because earliest prefetch start time = " + << earliest_prefetch_time; + break; + } + } + + return prefetch_end_time; +} + +AlternateMemoryBestFitHeap::Result AlternateMemoryBestFitHeap::Prefetch( const AllocationRequest& request, const MemorySpaceAssignment::Allocation& prev_allocation_in_default_mem) { // Try partially placing the buffer in the alternate space. The time that is @@ -1899,9 +2180,11 @@ bool AlternateMemoryBestFitHeap::Prefetch( earliest_prefetch_time = std::max(earliest_prefetch_time, *request.earliest_prefetch_time); } - options_.prefetch_interval_picker->Begin(request.use->hlo_use, - earliest_prefetch_time, - request.latest_prefetch_time); + int64 prefetch_end_time = + FindPrefetchEndTime(request, earliest_prefetch_time); + + options_.prefetch_interval_picker->Begin( + request.use->hlo_use, earliest_prefetch_time, prefetch_end_time); VLOG(3) << "Trying prefetch picker = " << options_.prefetch_interval_picker->ToDebugString(); @@ -1910,33 +2193,30 @@ bool AlternateMemoryBestFitHeap::Prefetch( BufferInterval alternate_mem_interval; alternate_mem_interval.buffer = request.allocation_value->value(); alternate_mem_interval.size = request.size; - // If any of the prefetch intervals couldn't be used due to number of - // outstanding async copy limit or async copy ordering, set - // prefetch_failed_due_to_async_copy_. - prefetch_failed_due_to_async_copy_ = false; // While uses might be allowed to have additional outstanding prefetches. int64 extra_async_copy_limit = request.use->hlo_use.instruction->opcode() == HloOpcode::kWhile ? options_.while_use_extra_outstanding_prefetch_limit : 0; + Result result = Result::kSuccess; while (!options_.prefetch_interval_picker->Done()) { alternate_mem_interval.start = options_.prefetch_interval_picker->Next(); - CHECK_LT(alternate_mem_interval.start, request.latest_prefetch_time); + CHECK_LT(alternate_mem_interval.start, prefetch_end_time); VLOG(4) << "Trying alternate memory allocation (" << alternate_mem_interval.start << ", " << request.end_time << ")"; // If this additional asynchronous copy would violate the limit, try a // different interval. 
if (ViolatesAsyncCopyOrdering(alternate_mem_interval.start, - request.latest_prefetch_time)) { + prefetch_end_time)) { VLOG(4) << "This would violate asynchronous copy ordering."; - prefetch_failed_due_to_async_copy_ = true; + result_mark(Result::kFailViolatesAsyncCopyOrdering, result); continue; } if (ViolatesMaximumOutstandingAsyncCopies( - alternate_mem_interval.start, request.latest_prefetch_time, + alternate_mem_interval.start, prefetch_end_time, /*is_prefetch=*/true, extra_async_copy_limit)) { VLOG(4) << "This would violate the outstanding async copy limit."; - prefetch_failed_due_to_async_copy_ = true; + result_mark(Result::kFailOutOfAsyncCopies, result); continue; } @@ -1955,16 +2235,22 @@ bool AlternateMemoryBestFitHeap::Prefetch( AddAsyncCopy(prev_allocation_in_default_mem, MemorySpace::kAlternate, chunk_candidate->chunk, alternate_mem_interval.start, - request.end_time, request.latest_prefetch_time, + request.end_time, prefetch_end_time, request.allocation_value->allocation_sequence()); request.allocation_value->allocation_sequence()->back()->AddUse( request.use->hlo_use); - prefetch_failed_due_to_async_copy_ = false; - return true; + return Result::kSuccess; } + result_mark(Result::kFailOutOfMemory, result); + } + // If we didn't consider any prefetch intervals, then the live range was too + // short. + if (result == Result::kSuccess) { + return Result::kFailLiveRangeTooShort; + } else { + return result; } - return false; } absl::optional @@ -2113,6 +2399,9 @@ bool IsCrossProgramPrefetchCandidate( return value.instruction()->parent() == value.instruction()->GetModule()->entry_computation() && value.instruction()->opcode() == HloOpcode::kParameter && + (!value.shape().has_layout() || + value.shape().layout().memory_space() != + options.alternate_memory_space) && value.index().size() == 1 && value.shape().IsArray() && !value.uses().empty() && options.size_fn(value) <= options.max_size_in_bytes && @@ -2381,6 +2670,8 @@ Status MemorySpaceAssignment::CopyAllocation::Process( HloOpcode::kCopyStart, producing_instruction)); copy_done_ = computation->AddInstruction( HloInstruction::CreateUnary(shape, HloOpcode::kCopyDone, copy_start_)); + VLOG(4) << "Created " << copy_start_->name() + << " for position: " << defining_position().ToString(); // Update the allocation position with the copy done instruction so that if // there are further copies from it, it can find the correct position. defining_position_ = HloPosition{copy_done_, {}}; @@ -2840,18 +3131,23 @@ Status MemorySpaceAssignment::VerifyAndExportHeapSimulatorTrace() { } } - if (last_use_instruction && - last_use_instruction->opcode() == HloOpcode::kConditional) { + std::function + split_conditional_buffer; + split_conditional_buffer = [&](const HloInstruction* use_instruction, + int64 start_time, int64 end_time, + absl::string_view indent_string) { // Special case when verifying conditional: we internally split the use // of alternate memory in conditionals, so fish them out from the // conditionals. 
- VLOG(3) << " Splitting conditional buffer: " << buffer.ToString() - << " value: " << value->ToShortString() << ": (" - << time_bound.start << ", " << time_bound.end - << ") off: " << chunk.offset << ", size: " << chunk.size; - int64 earliest_computation_start_time = time_bound.end; + VLOG(3) << indent_string + << "Splitting conditional buffer: " << buffer.ToString() + << " value: " << value->ToShortString() << ": (" << start_time + << ", " << end_time << ") off: " << chunk.offset + << ", size: " << chunk.size; + int64 earliest_computation_start_time = end_time; for (const HloComputation* called_computation : - last_use_instruction->called_computations()) { + use_instruction->called_computations()) { earliest_computation_start_time = std::min(earliest_computation_start_time, hlo_live_range->computation_span_times() @@ -2859,6 +3155,7 @@ Status MemorySpaceAssignment::VerifyAndExportHeapSimulatorTrace() { .start); int64 parameter_time = -1; int64 last_use_time = -1; + const HloInstruction* last_use_instruction = nullptr; for (const HloPosition& position : value->positions()) { if (position.instruction->opcode() == HloOpcode::kParameter && position.instruction->parent() == called_computation) { @@ -2868,27 +3165,45 @@ Status MemorySpaceAssignment::VerifyAndExportHeapSimulatorTrace() { } } for (const HloUse& use : value->uses()) { - if (use.instruction->parent() == called_computation) { - last_use_time = std::max( - last_use_time, - hlo_live_range->instruction_schedule().at(use.instruction)); + int64 use_time = + hlo_live_range->instruction_schedule().at(use.instruction); + if (use.instruction->parent() == called_computation && + use_time > last_use_time) { + last_use_time = use_time; + last_use_instruction = use.instruction; } } if (last_use_time != -1) { CHECK_NE(parameter_time, -1); - VLOG(3) << " computation: " << called_computation->name() << ": (" + VLOG(3) << indent_string + << " computation: " << called_computation->name() << ": (" << parameter_time << ", " << last_use_time << ")"; - TF_RETURN_IF_ERROR(add_allocation_and_verify( - parameter_time, last_use_time, chunk, value)); + CHECK(last_use_instruction); + if (last_use_instruction->opcode() == HloOpcode::kConditional) { + // The last use is another (nested) conditional. Call this + // function recursively. 
+ TF_RETURN_IF_ERROR(split_conditional_buffer( + last_use_instruction, parameter_time, last_use_time, + absl::StrCat(indent_string, " "))); + } else { + TF_RETURN_IF_ERROR(add_allocation_and_verify( + parameter_time, last_use_time, chunk, value)); + } } } - VLOG(3) << " from beginning until first computation: (" - << time_bound.start << ", " - << (earliest_computation_start_time - 1) << ")"; + VLOG(3) << indent_string << " from beginning until first computation: (" + << start_time << ", " << (earliest_computation_start_time - 1) + << ")"; TF_RETURN_IF_ERROR(add_allocation_and_verify( - time_bound.start, earliest_computation_start_time - 1, chunk, - value)); - } else { + start_time, earliest_computation_start_time - 1, chunk, value)); + return Status::OK(); + }; + + if (last_use_instruction && + last_use_instruction->opcode() == HloOpcode::kConditional) { + TF_RETURN_IF_ERROR(split_conditional_buffer( + last_use_instruction, time_bound.start, time_bound.end, " ")); + } else if (!value->uses().empty()) { VLOG(3) << " buffer: " << buffer.ToString() << " value: " << value->ToShortString() << ": (" << time_bound.start << ", " << time_bound.end diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.h b/tensorflow/compiler/xla/service/memory_space_assignment.h index 5c5329033fd..d366c06a599 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.h +++ b/tensorflow/compiler/xla/service/memory_space_assignment.h @@ -18,6 +18,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/heap_simulator.h" #include "tensorflow/compiler/xla/service/hlo_cost_analysis.h" +#include "tensorflow/compiler/xla/service/memory_space_assignment_repacking.h" namespace xla { @@ -84,6 +85,8 @@ class MemorySpaceAssignmentCostAnalysis { absl::flat_hash_map while_nest_multiplier; }; + virtual ~MemorySpaceAssignmentCostAnalysis() = default; + static StatusOr> Create( const HloCostAnalysis& cost_analysis, float async_copy_bandwidth_bytes_per_second, @@ -126,18 +129,23 @@ class MemorySpaceAssignmentCostAnalysis { // BufferInterval is prefetched. float GetInstructionElapsedDueToMemorySlowdown(int64 bytes) const; + // Returns the estimated elapsed duration of the instruction in seconds. It + // assumes all operands and outputs of the instruction are in the default + // memory. + virtual float GetInstructionElapsed(const HloInstruction& instruction) const; + // Returns the estimated elapsed duration of the instruction in seconds. It // assumes all operands and outputs of the instruction are in the default // memory, except for the operand number that is in the alternate memory, if // provided, or output if output_in_alternate_mem is true. - float GetInstructionElapsed( + virtual float GetInstructionElapsedInAlternateMemory( const HloInstruction& instruction, - absl::optional operand_in_alternate_mem = absl::nullopt, - bool output_in_alternate_mem = false) const; + absl::optional operand_in_alternate_mem, + bool output_in_alternate_mem) const; // Returns the elapsed time it would take to asynchronously copy the shape // from default to alternate memory space (or vice versa). 
- float GetAsyncCopyElapsed(const Shape& shape) const; + virtual float GetAsyncCopyElapsed(const Shape& shape) const; int64 GetScheduleEndTime() const; @@ -147,7 +155,7 @@ class MemorySpaceAssignmentCostAnalysis { const HloLiveRange& hlo_live_range() const { return *hlo_live_range_; } - private: + protected: MemorySpaceAssignmentCostAnalysis( const HloCostAnalysis& cost_analysis, float async_copy_bandwidth_bytes_per_second, @@ -164,6 +172,7 @@ class MemorySpaceAssignmentCostAnalysis { hlo_live_range_(std::move(hlo_live_range)), call_graph_(std::move(call_graph)) {} + private: const HloCostAnalysis& cost_analysis_; float async_copy_bandwidth_bytes_per_second_; float alternate_mem_bandwidth_bytes_per_second_; @@ -190,6 +199,17 @@ class PrefetchIntervalPicker { virtual int64 PreferredEvictionEndTime(const Shape& shape, int64 start_time, int64 latest_end_time) const = 0; + // Returns the latest time that a prefetch can start. + virtual int64 LatestPrefetchStartTime(const HloUse& use, int64 start_time, + int64 end_time) const = 0; + + // Returns the latest time that a prefetch can end that is less than or equal + // to proposed_prefetch_end_time. + virtual int64 LatestPrefetchEndTime(int64 original_prefetch_end_time, + int64 proposed_prefetch_end_time) const { + return proposed_prefetch_end_time; + } + // Begins the iterator for the first start time of the prefetch. virtual void Begin(const HloUse& use, int64 start_time, int64 end_time) = 0; @@ -248,6 +268,9 @@ class InstructionCountPrefetchIntervalPicker : public PrefetchIntervalPicker { int64 PreferredEvictionEndTime(const Shape& shape, int64 start_time, int64 latest_end_time) const override; + int64 LatestPrefetchStartTime(const HloUse& use, int64 start_time, + int64 end_time) const override; + void Begin(const HloUse& use, int64 start_time, int64 end_time) override; int64 Next() override; @@ -267,16 +290,16 @@ class InstructionCountPrefetchIntervalPicker : public PrefetchIntervalPicker { // Prefetch interval picker that uses cost analysis to overlap asynchronous // copies with independent computation. It uses min/max (asynchronous copy // duration) / (independent computation duration) ratios to guide whether the -// prefetch is within those bounds. It starts with the maximum allowed ratio -// (earliest prefetch) in Begin() and works its way for later and later prefetch -// with each Next() call until hitting the minimum ratio, in order not to hurt -// the critical path. +// prefetch is within those bounds. It starts with the preferred ratio in +// Begin() and works its way for alternately earlier and later prefetches until +// hitting min and max ratios. 
class CostAnalysisPrefetchIntervalPicker : public PrefetchIntervalPicker { public: CostAnalysisPrefetchIntervalPicker( const MemorySpaceAssignmentCostAnalysis& cost_analysis, float min_async_copy_to_overlap_ratio, - float max_async_copy_to_overlap_ratio); + float max_async_copy_to_overlap_ratio, + float preferred_async_copy_to_overlap_ratio); bool CanAllocateInAlternateMemoryNoCopy(const Shape& shape, int64 start_time, int64 end_time) const override; @@ -284,6 +307,11 @@ class CostAnalysisPrefetchIntervalPicker : public PrefetchIntervalPicker { int64 PreferredEvictionEndTime(const Shape& shape, int64 start_time, int64 latest_end_time) const override; + int64 LatestPrefetchStartTime(const HloUse& use, int64 start_time, + int64 end_time) const override; + int64 LatestPrefetchEndTime(int64 original_prefetch_end_time, + int64 proposed_prefetch_end_time) const override; + void Begin(const HloUse& use, int64 start_time, int64 end_time) override; int64 Next() override; @@ -319,13 +347,17 @@ class CostAnalysisPrefetchIntervalPicker : public PrefetchIntervalPicker { const MemorySpaceAssignmentCostAnalysis& cost_analysis_; float min_async_copy_to_overlap_ratio_; float max_async_copy_to_overlap_ratio_; + float preferred_async_copy_to_overlap_ratio_; float max_overlap_multiplier_ = 1.0; float async_copy_elapsed_; float inst_elapsed_reduction_; int64 end_logical_time_; - int64 earliest_start_logical_time_; - int64 current_logical_prefetch_time_; + int64 earliest_prefetch_time_; + int64 latest_prefetch_time_; + bool using_increasing_prefetch_time_iterator_; + int64 increasing_prefetch_time_iterator_; + int64 decreasing_prefetch_time_iterator_; }; // MemorySpaceAssignment assigns memory spaces (default or alternate) to each @@ -348,6 +380,9 @@ class MemorySpaceAssignment { // space and a fast and small alternate memory space. enum class MemorySpace { kDefault, kAlternate }; + // Forward declaration for Allocation. + class Allocation; + // The different options to be passed to the Run() API. struct Options { // Backend-specific integer value that describes the alternate memory. @@ -383,11 +418,25 @@ class MemorySpaceAssignment { // max_outstanding_prefetches). int64 while_use_extra_outstanding_prefetch_limit = 0; + // Specifies the maximum number of times we are willing to move a copy + // done of a prefetch earlier due to an asynchronous copy ordering + // violation. + int64 prefetch_copy_done_reorder_max_retries = 1; + // Specifies the maximum number of retries that will be performed for each // value in case prefetching failed due to running out of asynchronous // copies or asynchronous copy ordering. int64 max_retries = 1; + // The maximum number of repacks that we are willing to perform in case we + // can't allocate a buffer due to running out of memory. If this value is + // greater than 0, repacker must be non-nullptr. + int64 max_repacks = 0; + + // The repacking algorithm to reduce fragmentation. Must be non-null if + // max_repacks is greater than 0. + MemorySpaceAssignmentRepacker* repacker = nullptr; + // If true, tries allocating buffers across (e.g., before and inside a while // loop body) sequential calls (kWhile, kCall, and kConditional). 
bool allocate_across_sequential_calls = false; @@ -475,10 +524,12 @@ class MemorySpaceAssignment { const std::vector& uses() const { return uses_; } MemorySpace memory_space() const { return memory_space_; } Chunk chunk() const { return *chunk_; } + Chunk* mutable_chunk() { return &*chunk_; } void set_start_time(int64 start_time) { start_time_ = start_time; } int64 start_time() const { return start_time_; } int64 end_time() const { return end_time_; } + bool operator==(const Allocation& other) const; virtual std::string ToString() const; protected: @@ -501,6 +552,9 @@ class MemorySpaceAssignment { }; // This class represents an allocation as a result of an asynchronous copy. + // Note: CopyStart instructions are inserted after `start_time` or later, + // while CopyDone instructions are inserted before + // `copy_done_schedule_before_time` or earlier. class CopyAllocation : public Allocation { public: CopyAllocation(const Allocation& prev_allocation, MemorySpace memory_space, @@ -550,6 +604,7 @@ class MemorySpaceAssignment { copy_start_schedule_after_ = copy_start_schedule_after; } + bool operator==(const CopyAllocation& other) const; std::string ToString() const override; private: @@ -646,13 +701,15 @@ class MemorySpaceAssignment { std::vector aliases; }; - AllocationValue(const HloValue* value, const HloPosition& position) - : value_(value), defining_position_(position) {} + AllocationValue(const HloValue* value, const HloPosition& position, + int64 size) + : value_(value), defining_position_(position), size_(size) {} const HloPosition& defining_position() const { return defining_position_; } const HloInstruction* defining_instruction() const { return defining_position().instruction; } + int64 size() const { return size_; } const std::vector& uses() const { return uses_; } std::vector& uses() { return uses_; } const HloValue* value() const { return value_; } @@ -671,6 +728,7 @@ class MemorySpaceAssignment { private: const HloValue* value_; HloPosition defining_position_; + int64 size_; std::vector uses_; AllocationSequence allocation_sequence_; }; @@ -835,9 +893,9 @@ class AsynchronousCopyOrdering { // Removes an asynchronous copy. CHECKs that it is removed. void RemoveCopy(const AsynchronousCopy& copy); - // Returns true if the addition of an asynchronous copy in the the given time - // interval would violate the asynchronous copy ordering. E.g., consider the - // following scenario: + // If the addition of an asynchronous copy in the given time interval would + // violate the asynchronous copy ordering, returns the violating + // already-committed asynchronous copy. E.g., consider the following scenario: // CS CD // already committed async copy: +-----------+ // new async copy: +--------+ @@ -845,7 +903,8 @@ class AsynchronousCopyOrdering { // The new asynchronous copy would violate the ordering guarantee because the // copy start is after an already committed asynchronous copy while its copy // done is before the committed copy. - bool ViolatesOrdering(int64 start_time, int64 end_time) const; + absl::optional ViolatesOrdering(int64 start_time, + int64 end_time) const; private: // Stores asynchronous copies in a tree set respecting the pipelining order. @@ -884,6 +943,9 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { HeapSimulator::Result Finish() override; private: + using RepackAllocationBlock = MemorySpaceAssignmentRepacker< + MemorySpaceAssignment::Allocation*>::AllocationBlock; + // An allocation request for a use segment. 
A use segment is the time segment // between the definition and the first use, and the time segment between the // uses of a buffer. For example, the time between the definition and Use1, is @@ -916,6 +978,62 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { MemorySpaceAssignment::AllocationValue* allocation_value; }; + // Result of an allocation, prefetch, eviction etc. request. The result is + // either kSuccess or a bitwise OR of one or more failures. The values are + // unique powers of two. To check if a result contains a particular failure, + // use the result_is method. To add a new failure to a result, use the + // result_mark method. + enum class Result { + // Successful allocation. + kSuccess = 0, + // Allocation failed because we ran out of alternate memory. + kFailOutOfMemory = 1, + // A no-copy allocation couldn't be performed because the previous + // allocation wasn't in the alternate memory space. + kFailPrevAllocationNotInAlternateMem = 2, + // A no-copy allocation couldn't be performed because the live range was too + // long. + kFailLiveRangeTooLong = 4, + // A prefetching couldn't be performed because the live range was too short. + kFailLiveRangeTooShort = 8, + // Ran out of outstanding asynchronous copy limit either during prefetching + // or eviction. + kFailOutOfAsyncCopies = 16, + // A prefetching couldn't be performed because the asynchronous copy + // ordering was violated. + kFailViolatesAsyncCopyOrdering = 32, + // An allocation failure happened that requires uncommitting all the pending + // allocations. Usually this is due to a situation requiring an eviction but + // the eviction couldn't be performed. + kFailRequiresUncommit = 64 + }; + + // Return true if the result belongs to a failure. + static bool result_is(Result result, Result failure) { + return static_cast(result) & static_cast(failure); + } + + // Mark (bitwise OR) a failure to the result. + static Result result_mark(Result failure, Result& result) { + result = static_cast(static_cast(result) | + static_cast(failure)); + return result; + } + + // Return true if the result is a failure that requires us to uncommit pending + // chunks. + static bool result_requires_uncommit(Result result) { + return result_is(result, Result::kFailRequiresUncommit); + } + + // Return true if the result is a failure either due to running out of + // outstanding asynchronous copies or due to violating asynchronous copy + // ordering. + static bool result_failed_because_of_async_copy(Result result) { + return result_is(result, Result::kFailOutOfAsyncCopies) || + result_is(result, Result::kFailViolatesAsyncCopyOrdering); + } + // Given an allocation sequence, returns the live allocation at time with a // preference towards allocations in alternate memory. Returns nullptr if no // allocation is alive at that time. @@ -926,17 +1044,24 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { bool IsUseAllowedInAlternateMemory(const AllocationValue& value, const HloUse& use) const; - // Given an HloValue, creates AllocationValue objects and corresponding + // Given a BufferInterval, creates AllocationValue objects and corresponding // AllocationSequences and appends them into allocation_sequence_list_. - void CreateAllocationValues(const HloValue* value, - std::vector* allocation_values); + void CreateAllocationValues( + const BufferInterval& buffer_interval, + std::vector& allocation_values) const; - // Finds allocations for colocated intervals. 
Colocated intervals consist of - // one or more BufferIntervals, each with a different HloValue. All of the - // intervals within colocated intervals have a must-alias relationship with - // each other. Returns true if allocation succeeded. - bool AllocateColocatedIntervals( - const std::vector& colocated_intervals); + // Given colocated intervals, populates allocation_values with the + // corresponding AllocationValue objects. + void CreateAllocationValuesFromColocatedIntervals( + absl::Span colocated_intervals, + std::vector& allocation_values); + + // Finds allocations for allocation values generated from colocated intervals. + // All of the allocation values have a must-alias relationship with each + // other. Returns either kSuccess if all of the sites could be placed in the + // alternate memory or a bitwise OR of failure reasons why they couldn't + Result AllocateAllocationValues( + absl::Span allocation_values); // Go through all the uses in the AllocationValues and find the aliasing // positions. @@ -954,20 +1079,26 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { // if there is enough space and if the prefetch interval picker allows. // // If an eviction (2) was requested and was unsuccessful, this method returns - // false. This means we could not find a suitable allocation, so all previous - // allocations for this buffer must be removed and allocated in the default - // memory. Otherwise, this method returns true. - bool AllocateSegment(const AllocationRequest& request); + // Result::kFailRequiresUncommit. This means we could not find a suitable + // allocation, so all previous allocations for this buffer must be removed and + // allocated in the default memory. Otherwise, this method may return + // Result::kSuccess if the buffer could be placed in alternate memory or some + // other Result with an OR of reasons why the buffer couldn't be placed in + // alternate memory. + Result AllocateSegment(const AllocationRequest& request); - // Try allocating in alternate memory without any copies. Returns true if - // successful. - bool AllocateInAlternateMemoryNoCopy(const AllocationRequest& request); + // Try allocating in alternate memory without any copies. + Result AllocateInAlternateMemoryNoCopy(const AllocationRequest& request); - // Try evicting to default memory space. Returns true if successful. - bool Evict(const AllocationRequest& request); + // Try evicting to default memory space. + Result Evict(const AllocationRequest& request); - // Try prefetching to alternate memory space. Returns true if successful. - bool Prefetch( + // Returns the time a copy done of a prefetch should be scheduled. + int64 FindPrefetchEndTime(const AllocationRequest& request, + int64 earliest_prefetch_time) const; + + // Try prefetching to alternate memory space. + Result Prefetch( const AllocationRequest& request, const MemorySpaceAssignment::Allocation& prev_allocation_in_default_mem); @@ -1030,8 +1161,20 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { int64 start_time, int64 end_time, bool is_prefetch, int64 extra_async_copy_limit = 0) const; - // Return true if the asynchronous copy would violate the pipelining order. - bool ViolatesAsyncCopyOrdering(int64 start_time, int64 end_time) const; + // If the asynchronous copy would violate the pipelining order, returns the + // violating asynchronous copy. 
+ absl::optional ViolatesAsyncCopyOrdering( + int64 start_time, int64 end_time) const; + + // Exports the allocations for repacking and puts them into the vector in the + // parameter. + void ExportAllocationsForRepacking( + std::vector& allocations); + + // Imports repacked allocations and updates the internal data structures + // consistent with the new packing. + void ImportRepackedAllocations( + absl::Span repacked_allocations); // Adds an asynchronous copy to the allocations. void AddAsyncCopy(const MemorySpaceAssignment::Allocation& prev_allocation, @@ -1047,17 +1190,24 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { const ChunkCandidate& chunk_candidate); // If we need to remove the allocations for this allocation sequence, this // removes pending chunks and asynchronous copies in the respective pending - // buffers from the interval trees. - void UncommitPendingChunks(); + // buffers from the interval trees. If an allocation request returns + // kFailRequiresUncommit, this method must be called. + void UncommitPendingChunks(absl::Span allocation_values); + + // Finalizes the allocations where they can no longer be uncommitted. + void FinalizeAllocations(absl::Span allocation_values); + + // Clears all pending chunks and asynchronous copies. + void ClearPendingChunks(); // Append buffer and allocation infos for debugging and dump it into a file, // if enabled. void AppendBufferInfoDebugString(const BufferInterval& interval, std::string* debug_str) const; void AppendAllocationInfoDebugString( - const BufferInterval& interval, + const AllocationValue& value, const MemorySpaceAssignment::Allocation& allocation, - std::string* debug_str) const; + std::string& debug_str) const; void DumpDebugStringsIfEnabled() const; // Returns the available heap size in the alternate memory. @@ -1074,6 +1224,11 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { BufferIntervalTree prefetch_interval_tree_; BufferIntervalTree eviction_interval_tree_; AsynchronousCopyOrdering async_copy_ordering_; + // A list of RepackAllocationBlock objects that mirrors allocation sequences, + // used for repacking. We use a list here because we need pointer stability + // for aliased allocations. + std::list repack_allocation_blocks_; + int64 num_repacks_ = 0; std::vector> pending_chunks_; std::vector pending_async_copies_; std::vector> @@ -1084,9 +1239,6 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { required_assignments_; // Number of bytes reserved in alternate memory space. int64 reserved_in_bytes_ = 0; - // Variables to control allocation retries. - bool final_retry_; - bool prefetch_failed_due_to_async_copy_; // Debug strings. std::string buffer_info_str_; std::string allocation_info_str_; diff --git a/tensorflow/compiler/xla/service/memory_space_assignment_repacking.h b/tensorflow/compiler/xla/service/memory_space_assignment_repacking.h new file mode 100644 index 00000000000..fcfdfc797fb --- /dev/null +++ b/tensorflow/compiler/xla/service/memory_space_assignment_repacking.h @@ -0,0 +1,57 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_REPACKING_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_REPACKING_H_ + +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/types.h" + +namespace xla { + +// An interface to define allocation repacking algorithms. +template +class MemorySpaceAssignmentRepacker { + public: + MemorySpaceAssignmentRepacker() = default; + virtual ~MemorySpaceAssignmentRepacker() = default; + + // A contiguous block of allocation consisting of start and end (logical) + // times, size, and the initial offset. After repacking, if the repacking was + // successful and the allocations were modified, the offset field holds the + // new offset. To support aliased allocations, AllocationBlock also includes a + // vector of AllocationBlock pointers, called colocations. All AllocationBlock + // objects within the colocations must get the same offset. The opaque field + // is used by the MemorySpaceAssignment pass and should not be accessed by the + // repacking algorithm. + struct AllocationBlock { + int64 start_time; + int64 end_time; + int64 size; + int64 offset; + int64 initial_offset; + std::vector colocations; + O opaque; + }; + + // Repack the AllocationBlocks provided in the parameter. Returns true if + // allocations have been modified and false if not. Returns a non-ok status if + // there was an error. 
+ virtual StatusOr Repack(absl::Span allocations) = 0; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_REPACKING_H_ diff --git a/tensorflow/compiler/xla/service/memory_space_assignment_test.cc b/tensorflow/compiler/xla/service/memory_space_assignment_test.cc index 10e11e55291..464cfb502be 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment_test.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment_test.cc @@ -60,7 +60,8 @@ class MemorySpaceAssignmentTest : public HloTestBase, CostAnalysisPrefetchIntervalPicker prefetch_interval_picker( CostAnalysisPrefetchIntervalPicker( *cost_analysis, /*min_async_copy_to_overlap_ratio=*/0.8, - /*max_async_copy_to_overlap_ratio=*/10.0)); + /*max_async_copy_to_overlap_ratio=*/10.0, + /*preferred_async_copy_to_overlap_ratio=*/1.5)); return AssignMemorySpace( module, /*max_outstanding_async_copies=*/-1, MemorySpaceAssignment::GetMemoryBoundednessBufferIntervalCompare( @@ -70,19 +71,22 @@ class MemorySpaceAssignmentTest : public HloTestBase, std::unique_ptr AssignMemorySpace( HloModule* module, int64 max_outstanding_async_copies = -1, - int64 max_prefetch_interval = 10, int64 min_prefetch_interval = 2) { + int64 max_prefetch_interval = 10, int64 min_prefetch_interval = 2, + absl::optional options = absl::nullopt) { InstructionCountPrefetchIntervalPicker prefetch_interval_picker( min_prefetch_interval, max_prefetch_interval); return AssignMemorySpace(module, max_outstanding_async_copies, /*buffer_interval_compare=*/{}, - &prefetch_interval_picker); + &prefetch_interval_picker, options); } std::unique_ptr AssignMemorySpace( HloModule* module, int64 max_outstanding_async_copies, absl::optional buffer_interval_compare, - PrefetchIntervalPicker* prefetch_interval_picker) { + PrefetchIntervalPicker* prefetch_interval_picker, + absl::optional + memory_space_assignment_options = absl::nullopt) { auto size_fn = [](const BufferValue& buffer) { return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/8); }; @@ -116,9 +120,15 @@ class MemorySpaceAssignmentTest : public HloTestBase, } MemorySpaceAssignment::Options options; + if (memory_space_assignment_options) { + options = *memory_space_assignment_options; + } else { + options.max_size_in_bytes = 128; + options.alignment_in_bytes = 8; + options.verify = true; + } + options.alternate_memory_space = kAlternateMemorySpace; - options.max_size_in_bytes = 128; - options.alignment_in_bytes = 8; options.buffer_interval_compare = buffer_interval_compare; options.prefetch_interval_picker = prefetch_interval_picker; options.size_fn = size_fn; @@ -126,7 +136,6 @@ class MemorySpaceAssignmentTest : public HloTestBase, options.max_outstanding_prefetches = max_outstanding_async_copies; options.max_outstanding_evictions = max_outstanding_async_copies; options.allocate_across_sequential_calls = GetParam(); - options.verify = true; auto alias_analysis = HloAliasAnalysis::Run(module).ValueOrDie(); std::unique_ptr hlo_live_range = @@ -285,6 +294,92 @@ class MemorySpaceAssignmentTest : public HloTestBase, MemorySpaceAssignmentCostAnalysis::Cache cache_; }; +// For testing purposes, we define a cost analysis where we can control the +// elapsed times of each HLO and asynchronous copy. 
+class FakeMemorySpaceAssignmentCostAnalysis + : public MemorySpaceAssignmentCostAnalysis { + public: + static StatusOr> + Create(const HloCostAnalysis& cost_analysis, const HloModule& module) { + TF_ASSIGN_OR_RETURN(auto alias_analysis, HloAliasAnalysis::Run(&module)); + TF_ASSIGN_OR_RETURN(auto hlo_live_range, + HloLiveRange::Run(module.schedule(), *alias_analysis, + module.entry_computation())); + auto call_graph = CallGraph::Build(&module); + return absl::WrapUnique(new FakeMemorySpaceAssignmentCostAnalysis( + cost_analysis, /*async_copy_bandwidth_bytes_per_second=*/1, + /*alternate_mem_bandwidth_bytes_per_second=*/1, + std::move(alias_analysis), std::move(hlo_live_range), + std::move(call_graph))); + } + + float GetInstructionElapsed( + const HloInstruction& instruction) const override { + if (get_instruction_elapsed_override_) { + return get_instruction_elapsed_override_(instruction); + } + return 1.0; + } + + float GetInstructionElapsedInAlternateMemory( + const HloInstruction& instruction, + absl::optional operand_in_alternate_mem, + bool output_in_alternate_mem) const override { + if (get_instruction_elapsed_in_alternate_memory_override_) { + return get_instruction_elapsed_in_alternate_memory_override_( + instruction, operand_in_alternate_mem, output_in_alternate_mem); + } + if (operand_in_alternate_mem) { + return 0.5; + } else { + return 1.0; + } + } + + float GetAsyncCopyElapsed(const Shape& shape) const override { + if (get_async_copy_elapsed_override_) { + return get_async_copy_elapsed_override_(shape); + } + return 3.0; + } + + // The following methods can be used to override what the above API calls + // return. + void SetOverrideForGetInstructionElapsed( + std::function function) { + get_instruction_elapsed_override_ = function; + } + void SetOverrideForGetInstructionElapsedInAlternateMemory( + std::function, bool)> + function) { + get_instruction_elapsed_in_alternate_memory_override_ = function; + } + void SetOverrideForGetAsyncCopyElapsed( + std::function function) { + get_async_copy_elapsed_override_ = function; + } + + protected: + FakeMemorySpaceAssignmentCostAnalysis( + const HloCostAnalysis& cost_analysis, + float async_copy_bandwidth_bytes_per_second, + float alternate_mem_bandwidth_bytes_per_second, + std::unique_ptr alias_analysis, + std::unique_ptr hlo_live_range, + std::unique_ptr call_graph) + : MemorySpaceAssignmentCostAnalysis( + cost_analysis, async_copy_bandwidth_bytes_per_second, + alternate_mem_bandwidth_bytes_per_second, std::move(alias_analysis), + std::move(hlo_live_range), std::move(call_graph)) {} + + private: + std::function + get_instruction_elapsed_override_ = nullptr; + std::function, bool)> + get_instruction_elapsed_in_alternate_memory_override_ = nullptr; + std::function get_async_copy_elapsed_override_ = nullptr; +}; + TEST_P(MemorySpaceAssignmentTest, ParameterOnly) { // A module consisting of a single parameter. Inputs/outputs are currently // excluded from memory space assignment. @@ -1718,6 +1813,59 @@ TEST_P(MemorySpaceAssignmentTest, WhileInPlaceBuffer) { } } +TEST_P(MemorySpaceAssignmentTest, WhileSharedBufferVerificationBug) { + // Tests a spurious verification failure when a while has the same value + // passed in twice (copy0) and that value is evicted within the while loop. 
+ absl::string_view hlo_string = R"( + HloModule module, is_scheduled=true + + while_cond { + p0 = (f32[3]{0}, f32[3]{0}, f32[3]{0}, pred[]) parameter(0) + ROOT gte = pred[] get-tuple-element(p0), index=3 + } + + while_body { + p0 = (f32[3]{0}, f32[3]{0}, f32[3]{0}, pred[]) parameter(0) + gte0 = f32[3]{0} get-tuple-element(p0), index=0 + gte1 = f32[3]{0} get-tuple-element(p0), index=1 + gte2 = f32[3]{0} get-tuple-element(p0), index=2 + gte3 = pred[] get-tuple-element(p0), index=3 + add = f32[3]{0} add(gte0, gte0) + negate0 = f32[3]{0} negate(add) + negate1 = f32[3]{0} negate(negate0) + negate2 = f32[3]{0} negate(negate1) + negate3 = f32[3]{0} negate(negate2) + negate4 = f32[3]{0} negate(negate3) + negate5 = f32[3]{0} negate(negate4) + negate6 = f32[3]{0} negate(negate5) + negate7 = f32[3]{0} negate(negate6) + negate8 = f32[3]{0} negate(negate7) + negate9 = f32[3]{0} negate(negate8) + negate10 = f32[3]{0} negate(negate9) + negate11 = f32[3]{0} negate(negate10) + negate12 = f32[3]{0} negate(negate11) + negate13 = f32[3]{0} negate(negate12) + negate14 = f32[3]{0} negate(negate13) + negate15 = f32[3]{0} negate(negate14) + negate16 = f32[3]{0} negate(negate15) + ROOT tuple = (f32[3]{0}, f32[3]{0}, f32[3]{0}, pred[]) tuple(gte0, gte0, negate16, gte3) + } + + ENTRY entry { + p0 = f32[3]{0} parameter(0) + p1 = pred[] parameter(1) + copy0 = f32[3]{0} copy(p0) + copy1 = f32[3]{0} copy(p0) + tuple = (f32[3]{0}, f32[3]{0}, f32[3]{0}, pred[]) tuple(copy0, copy0, copy1, p1) + while = (f32[3]{0}, f32[3]{0}, f32[3]{0}, pred[]) while(tuple), condition=while_cond, body=while_body + ROOT gte = f32[3]{0} get-tuple-element(while), index=2 + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + AssignMemorySpace(module.get()); +} + TEST_P(MemorySpaceAssignmentTest, ControlPredecessorsBug) { // Having control_predecessors on an HLO was preventing us from DCEing an op // that doesn't have any users (tuple.1). The scheduler assumes the graph is @@ -2066,6 +2214,58 @@ TEST_P(MemorySpaceAssignmentTest, NestedConditional) { } } +TEST_P(MemorySpaceAssignmentTest, NestedConditionalBufferReuseVerificationBug) { + // Tests a spurious verification failure when there are nested conditionals + // and the innermost conditional computation reuses the buffer. Here, both the + // parameter of true_computation2 and neg2 will get the same buffer. Make sure + // that verification doesn't claim a failure in this case. 
+ absl::string_view hlo_string = R"( + HloModule CondAllocation, is_scheduled=true + + true_computation2 { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + neg1 = f32[3]{0} negate(gte) + neg2 = f32[3]{0} negate(neg1) + ROOT neg3 = f32[3]{0} negate(neg2) + } + + false_computation2 { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg4 = f32[3]{0} negate(gte) + } + + true_computation1 { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + slice = f32[1]{0} slice(gte), slice={[0:1]} + bitcast = f32[] bitcast(slice) + constant = f32[] constant(0.0) + compare = pred[] compare(bitcast, constant), direction=GT + tuple = (f32[3]{0}) tuple(gte) + ROOT conditional = f32[3]{0} conditional(compare, tuple, tuple), true_computation=true_computation2, false_computation=false_computation2 + } + + false_computation1 { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg5 = f32[3]{0} negate(gte) + } + + ENTRY entry { + p0 = f32[3]{0} parameter(0) + p1 = pred[] parameter(1) + copy = f32[3]{0} copy(p0) + tuple = (f32[3]{0}) tuple(copy) + ROOT conditional = f32[3]{0} conditional(p1, tuple, tuple), true_computation=true_computation1, false_computation=false_computation1 + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + AssignMemorySpace(module.get()); +} + TEST_P(MemorySpaceAssignmentTest, RequestIdentifierShouldNotBeAllocatedInAlternateMem) { // Ensure that request identifier returned by Send/Recv HLOs are not allocated @@ -3749,6 +3949,286 @@ TEST_P(MemorySpaceAssignmentTest, PendingChunkMemoryCorruptionBug) { buffer_interval_compare, &prefetch_interval_picker); } +TEST_P(MemorySpaceAssignmentTest, MoveCopyDoneEarlier) { + // This tests the case where an earlier placed smaller buffer may block a + // larger buffer due to asynchronous copy ordering. The smaller buffer (the + // operand of sin) will be placed first. The cos, whose operand is 3 times + // larger than sin's, needs longer time for the asynchronous copy.
The cos is + // placed right after sin, leading to a copy ordering violation: + // + // param1------------------>CS----->CD->sin + // param0------------->CS------------------->CD->cos + // + // To fix this, we need to move copy done for cos earlier and ensure both of + // these buffers get alternate memory allocations: + // + // param1------------------>CS----->CD->sin + // param0-->CS------------------->CD------------>cos + absl::string_view hlo_string = R"( + HloModule module, is_scheduled=true + + ENTRY Entry { + param0 = f32[8,3] parameter(0) + param1 = f32[2,4] parameter(1) + a = f32[2,4] negate(param1) + b = f32[2,4] negate(a) + c = f32[2,4] negate(b) + d = f32[2,4] negate(c) + e = f32[2,4] negate(d) + f = f32[2,4] negate(e) + g = f32[2,4] negate(f) + h = f32[2,4] negate(g) + i = f32[2,4] negate(h) + j = f32[2,4] negate(i) + k = f32[2,4] negate(j) + l = f32[2,4] negate(k) + m = f32[2,4] negate(l) + n = f32[2,4] negate(m) + sin = f32[2,4] sine(param1) + o = f32[2,4] negate(n) + cos = f32[8,3] cosine(param0) + ROOT tuple = (f32[8,3], f32[2,4], f32[2,4]) tuple(cos, sin, o) + } + )"; + + MemorySpaceAssignment::BufferIntervalCompare buffer_interval_compare = + [](const MemorySpaceAssignment::BufferInterval& a, + const MemorySpaceAssignment::BufferInterval& b) { + auto get_opcode_priority = [](const HloOpcode& opcode) { + switch (opcode) { + case HloOpcode::kSin: + return 0; + case HloOpcode::kCos: + return 1; + case HloOpcode::kTanh: + return 2; + default: + return 3; + } + }; + + auto get_user_priority = [&](const HloValue& value) { + int priority = INT_MAX; + for (const auto& use : value.uses()) { + priority = std::min(priority, + get_opcode_priority(use.instruction->opcode())); + } + return priority; + }; + + return get_user_priority(*a.buffer) < get_user_priority(*b.buffer); + }; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + + HloCostAnalysis hlo_cost_analysis(ShapeSize); + TF_ASSERT_OK_AND_ASSIGN(auto cost_analysis, + FakeMemorySpaceAssignmentCostAnalysis::Create( + hlo_cost_analysis, *module)); + cost_analysis->SetOverrideForGetAsyncCopyElapsed([](const Shape& shape) { + // This should return 2 for f32[2,4] and 6 for f32[8,3]. + return ShapeSize(shape) / 16; + }); + CostAnalysisPrefetchIntervalPicker interval_picker( + *cost_analysis, + /*min_async_copy_to_overlap_ratio=*/1.0, + /*max_async_copy_to_overlap_ratio=*/4.0, + /*preferred_async_copy_to_overlap_ratio=*/1.5); + AssignMemorySpace(module.get(), /*max_outstanding_async_copies=*/-1, + buffer_interval_compare, &interval_picker); + + // Check that both cos and sin could get their operands prefetched. + const HloInstruction* cos = + module->entry_computation()->GetInstructionWithName("cos"); + const HloInstruction* sin = + module->entry_computation()->GetInstructionWithName("sin"); + EXPECT_THAT(sin->operand(0), + op::AsyncCopy(kAlternateMemorySpace, kDefaultMemorySpace, + op::Parameter(1))); + EXPECT_THAT(cos->operand(0), + op::AsyncCopy(kAlternateMemorySpace, kDefaultMemorySpace, + op::Parameter(0))); + + // Sanity check that the cos' operand copy-done is scheduled earlier than + // sin's operand. 
+ auto find_schedule_index = [&](const HloInstruction* instruction) { + const auto& instructions = + module->schedule().sequence(module->entry_computation()).instructions(); + for (int i = 0; i < instructions.size(); ++i) { + if (instruction == instructions[i]) { + return i; + } + } + CHECK(false); + return -1; + }; + EXPECT_GT(find_schedule_index(sin->operand(0)), + find_schedule_index(cos->operand(0))); +} + +// A mock MemorySpaceAssignmentRepacker class that accepts a map of +// (start_time,offset) -> new_offset values. Using this map, the repacker +// repacks the allocations to the new_offset. +class FakeMemorySpaceAssignmentRepacker + : public MemorySpaceAssignmentRepacker<MemorySpaceAssignment::Allocation*> { + public: + FakeMemorySpaceAssignmentRepacker( + absl::flat_hash_map<std::pair<int64, int64>, int64>& repack_map) + : repack_map_(repack_map) {} + + StatusOr<bool> Repack(absl::Span<AllocationBlock*> allocations) override { + bool modified = false; + for (AllocationBlock* block : allocations) { + VLOG(1) << "Alloc time: [" << block->start_time << ", " << block->end_time + << "] size: " << block->size + << " init offset: " << block->initial_offset; + auto it = repack_map_.find({block->start_time, block->initial_offset}); + if (it != repack_map_.end()) { + modified = true; + block->offset = it->second; + } else { + block->offset = block->initial_offset; + } + for (AllocationBlock* colocation : block->colocations) { + VLOG(1) << " [" << colocation->start_time << ", " + << colocation->end_time << "]"; + if (it != repack_map_.end()) { + colocation->offset = it->second; + } else { + colocation->offset = colocation->initial_offset; + } + } + } + + return modified; + } + + private: + // A map from (start_time, offset) to new_offset. + absl::flat_hash_map<std::pair<int64, int64>, int64> repack_map_; +}; + +TEST_P(MemorySpaceAssignmentTest, Repack) { + // We initially perform the following allocations at these offsets. + // + // Max memory + // ------------------------------------------- + // + // + // + // + // +------------+ + // | b | + // +------------+ + // +-------+ +------------+ + // | a | | n | + // +-------+ +------------+ + // ------------------------------------------- + // Min memory time -> + // + // Next up, we try to allocate the prefetch for m.
However due to + // fragmentation, this won't be possible: + // + // Max memory + // ------------------------------------------- + // + // + // + // +---------+ + // +------------+ | + // | b | | | + // +------------+ | + // +-------+ | | +------------+ + // | a | | d | | n | + // +-------+ +---------+ +------------+ + // ------------------------------------------- + // Min memory time -> + // + // We then call repack to repack the existing allocations which allows us to + // allocate the prefetch for m: + // + // Max memory + // ------------------------------------------- + // +---------+ + // | | + // | | + // | | + // +-------+ | | + // | a | | d | + // +-------+ +---------+ + // +------------+ +------------+ + // | b | | n | + // +------------+ +------------+ + // ------------------------------------------- + // Min memory time -> + absl::string_view hlo_string = R"( + HloModule bug, is_scheduled=true + + ENTRY Entry { + param0 = f32[8,3] parameter(0) + param1 = f32[2,4] parameter(1) + a = f32[2,4] sine(param1) + b = f32[2,4] cosine(param1) + c = f32[8,3] negate(param0) + j = f32[2,4] negate(a) + d = f32[8,3] tanh(param0) + k = f32[2,4] negate(j) + l = f32[2,4] add(b, k) + m = f32[8,3] negate(d) + n = f32[2,4] sine(l) + o = f32[8,3] negate(m) + p = f32[2,4] negate(n) + q = f32[8,3] negate(m) + ROOT tuple = (f32[2,4], f32[8,3], f32[8,3]) tuple(p, q, o) + } + )"; + + MemorySpaceAssignment::BufferIntervalCompare buffer_interval_compare = + [](const MemorySpaceAssignment::BufferInterval& a, + const MemorySpaceAssignment::BufferInterval& b) { + auto get_opcode_priority = [](const HloOpcode& opcode) { + switch (opcode) { + case HloOpcode::kSin: + return 0; + case HloOpcode::kCos: + return 1; + case HloOpcode::kTanh: + return 2; + default: + return 3; + } + }; + + return get_opcode_priority(a.buffer->defining_instruction()->opcode()) < + get_opcode_priority(b.buffer->defining_instruction()->opcode()); + }; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + + InstructionCountPrefetchIntervalPicker prefetch_interval_picker(2, 10); + absl::flat_hash_map, int64> repack_map; + // Move "a" from offset 0 to 32. + repack_map[{2, 0}] = 32; + // Move "b" from offset 32 to 0. + repack_map[{3, 32}] = 0; + FakeMemorySpaceAssignmentRepacker repacker = + FakeMemorySpaceAssignmentRepacker(repack_map); + MemorySpaceAssignment::Options options; + options.max_size_in_bytes = 128; + options.alignment_in_bytes = 8; + options.verify = true; + options.max_repacks = 1; + options.repacker = &repacker; + AssignMemorySpace(module.get(), /*max_outstanding_async_copies=*/-1, + buffer_interval_compare, &prefetch_interval_picker, + options); + + // If repacking succeeds, we should find the buffer for d in alternate memory. + const HloInstruction* d = + module->entry_computation()->GetInstructionWithName("d"); + EXPECT_EQ(d->shape().layout().memory_space(), kAlternateMemorySpace); +} + TEST_P(MemorySpaceAssignmentTest, Determinism) { // Run memory space assignment a few times to make sure every time it compiles // to the same thing. 
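The FakeMemorySpaceAssignmentRepacker above is the map-driven test double; for reference, the smallest conforming implementation of the repacking interface is sketched below (a hypothetical NoOpRepacker, not part of this change). It leaves every block at its initial offset and reports that nothing was modified; the template argument mirrors the MemorySpaceAssignment::Allocation* instantiation used by AlternateMemoryBestFitHeap's RepackAllocationBlock alias.

    class NoOpRepacker
        : public MemorySpaceAssignmentRepacker<MemorySpaceAssignment::Allocation*> {
     public:
      StatusOr<bool> Repack(absl::Span<AllocationBlock*> allocations) override {
        for (AllocationBlock* block : allocations) {
          // Keep the original packing. Blocks in the same colocation group must
          // end up at the same offset, which holds trivially here because every
          // block simply keeps its initial offset.
          block->offset = block->initial_offset;
        }
        return false;  // No allocation was moved.
      }
    };

A real repacking algorithm would instead search for a new, less fragmented set of offsets and return true when it changes any of them; MemorySpaceAssignment then retries the allocation that ran out of memory, as exercised by the Repack test above via options.max_repacks = 1 and options.repacker = &repacker.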
@@ -4045,5 +4525,278 @@ TEST_P(MemorySpaceAssignmentTest, CrossProgramPrefetchFusionTest) { EXPECT_EQ(cross_program_prefetches.size(), 0); } +TEST_P(MemorySpaceAssignmentTest, CrossProgramPrefetchPinnedTest) { + HloComputation::Builder builder(TestName()); + + constexpr int kBatch = 8; + constexpr int kFeature = 8; + constexpr int kOutput = 2; + + auto lhs_shape = ShapeUtil::MakeShape(F32, {kBatch, kFeature}); + auto rhs_shape = ShapeUtil::MakeShapeWithLayout( + F32, {kFeature, kOutput}, + /*minor_to_major=*/{1, 0}, /*tiles=*/{}, /*element_size_in_bits=*/0, + kAlternateMemorySpace); + auto result_shape = ShapeUtil::MakeShape(F32, {kBatch, kOutput}); + auto tuple_shape = ShapeUtil::MakeTupleShape({lhs_shape, rhs_shape}); + HloInstruction* param = builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape, "p0")); + + auto lhs = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(lhs_shape, param, 0)); + auto rhs = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(rhs_shape, param, 1)); + + DotDimensionNumbers dot_dnums; + dot_dnums.add_lhs_contracting_dimensions(1); + dot_dnums.add_rhs_contracting_dimensions(0); + auto dot = builder.AddInstruction(HloInstruction::CreateDot( + result_shape, lhs, rhs, dot_dnums, DefaultPrecisionConfig(2))); + + auto module = CreateNewVerifiedModule(); + HloComputation* computation = module->AddEntryComputation(builder.Build()); + + HloSchedule schedule(module.get()); + schedule.set_sequence(computation, {param, lhs, rhs, dot}); + TF_CHECK_OK(module->set_schedule(schedule)); + + AssignMemorySpace(module.get()); + + auto cross_program_prefetches = module->CrossProgramPrefetches(); + EXPECT_EQ(cross_program_prefetches.size(), 0); +} + +using CostAnalysisPrefetchIntervalPickerTest = HloTestBase; + +TEST_F(CostAnalysisPrefetchIntervalPickerTest, PrefetchIntervalOrder) { + absl::string_view hlo_string = R"( + HloModule bug, is_scheduled=true + + ENTRY Entry { + param0 = f32[2,4] parameter(0) + a = f32[2,4] negate(param0) + b = f32[2,4] negate(a) + c = f32[2,4] negate(b) + d = f32[2,4] negate(c) + e = f32[2,4] negate(d) + f = f32[2,4] negate(e) + g = f32[2,4] negate(f) + h = f32[2,4] negate(g) + i = f32[2,4] negate(h) + j = f32[2,4] negate(i) + k = f32[2,4] negate(j) + l = f32[2,4] negate(k) + m = f32[2,4] negate(l) + n = f32[2,4] negate(m) + o = f32[2,4] negate(n) + p = f32[2,4] negate(o) + q = f32[2,4] negate(p) + r = f32[2,4] negate(q) + s = f32[2,4] negate(r) + t = f32[2,4] negate(s) + u = f32[2,4] negate(t) + ROOT v = f32[2,4] add(u, param0) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + + HloCostAnalysis hlo_cost_analysis(ShapeSize); + TF_ASSERT_OK_AND_ASSIGN(auto cost_analysis, + FakeMemorySpaceAssignmentCostAnalysis::Create( + hlo_cost_analysis, *module)); + CostAnalysisPrefetchIntervalPicker interval_picker( + *cost_analysis, + /*min_async_copy_to_overlap_ratio=*/1.0, + /*max_async_copy_to_overlap_ratio=*/4.0, + /*preferred_async_copy_to_overlap_ratio=*/2.0); + + HloInstruction* root = module->entry_computation()->root_instruction(); + const HloUse use{root, /*operand_number=*/1, /*operand_index=*/{}}; + interval_picker.Begin(use, /*start_time=*/0, /*end_time=*/22); + + // Expect that the first interval is (15, 22), which has an elapsed time of 6.0, + // twice the async copy elapsed time (3.0).
Then we expect that intervals will be + // visited in alternating increasing and decreasing orders until hitting the + // min and max async copy overlap ratios, which are the intervals (18, 22) + // and (9, 22) respectively. + LOG(INFO) << interval_picker.ToDebugString(); + EXPECT_EQ(interval_picker.Next(), 15); + LOG(INFO) << interval_picker.ToDebugString(); + EXPECT_EQ(interval_picker.Next(), 16); + LOG(INFO) << interval_picker.ToDebugString(); + EXPECT_EQ(interval_picker.Next(), 14); + LOG(INFO) << interval_picker.ToDebugString(); + EXPECT_EQ(interval_picker.Next(), 17); + LOG(INFO) << interval_picker.ToDebugString(); + EXPECT_EQ(interval_picker.Next(), 13); + LOG(INFO) << interval_picker.ToDebugString(); + EXPECT_EQ(interval_picker.Next(), 18); // Min async overlap ratio reached. + LOG(INFO) << interval_picker.ToDebugString(); + EXPECT_EQ(interval_picker.Next(), 12); + LOG(INFO) << interval_picker.ToDebugString(); + EXPECT_EQ(interval_picker.Next(), 11); + LOG(INFO) << interval_picker.ToDebugString(); + EXPECT_EQ(interval_picker.Next(), 10); + LOG(INFO) << interval_picker.ToDebugString(); + EXPECT_EQ(interval_picker.Next(), 9); // Max async overlap ratio reached. + LOG(INFO) << interval_picker.ToDebugString(); + EXPECT_TRUE(interval_picker.Done()); + + // Expect that if the time between start_time and end_time is too short, there + // won't be any available intervals. + interval_picker.Begin(use, /*start_time=*/19, /*end_time=*/22); + LOG(INFO) << interval_picker.ToDebugString(); + EXPECT_TRUE(interval_picker.Done()); +} + +TEST_F(CostAnalysisPrefetchIntervalPickerTest, PrefetchIntervalOrderWhile) { + absl::string_view hlo_string = R"( + HloModule bug, is_scheduled=true + + while_condition { + param1 = (f32[2,4]) parameter(0) // 19 + ROOT cond = pred[] constant(true) // 20 + } + + while_body { + param2 = (f32[2,4]) parameter(0) // 21 + gte2 = f32[2,4] get-tuple-element(param2), index=0 // 22 + add = f32[2,4] add(gte2, gte2) // 23 + ROOT tuple2 = (f32[2,4]) tuple(add) // 24 + } + + ENTRY Entry { + param0 = f32[2,4] parameter(0) // 0 + a = f32[2,4] negate(param0) // 1 + b = f32[2,4] negate(a) // 2 + c = f32[2,4] negate(b) // 3 + d = f32[2,4] negate(c) // 4 + e = f32[2,4] negate(d) // 5 + f = f32[2,4] negate(e) // 6 + g = f32[2,4] negate(f) // 7 + h = f32[2,4] negate(g) // 8 + i = f32[2,4] negate(h) // 9 + j = f32[2,4] negate(i) // 10 + k = f32[2,4] negate(j) // 11 + l = f32[2,4] negate(k) // 12 + m = f32[2,4] negate(l) // 13 + n = f32[2,4] negate(m) // 14 + o = f32[2,4] negate(n) // 15 + p = f32[2,4] negate(o) // 16 + q = f32[2,4] negate(p) // 17 + tuple = (f32[2,4]) tuple(q) // 18 + while = (f32[2,4]) while(tuple), condition=while_condition, body=while_body // 25 + gte1 = f32[2,4] get-tuple-element(while), index=0 // 26 + r = f32[2,4] negate(gte1) // 27 + s = f32[2,4] negate(r) // 28 + t = f32[2,4] negate(s) // 29 + u = f32[2,4] negate(t) // 30 + ROOT v = f32[2,4] add(u, param0) // 31 + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + + HloCostAnalysis hlo_cost_analysis(ShapeSize); + TF_ASSERT_OK_AND_ASSIGN(auto cost_analysis, + FakeMemorySpaceAssignmentCostAnalysis::Create( + hlo_cost_analysis, *module)); + CostAnalysisPrefetchIntervalPicker interval_picker( + *cost_analysis, + /*min_async_copy_to_overlap_ratio=*/1.0, + /*max_async_copy_to_overlap_ratio=*/12.0, + /*preferred_async_copy_to_overlap_ratio=*/2.0); + + HloInstruction* root = module->entry_computation()->root_instruction(); + const HloUse use{root, /*operand_number=*/1, 
/*operand_index=*/{}}; + interval_picker.Begin(use, /*start_time=*/0, /*end_time=*/31); + + // Because there are while loop computations between [19, 24], we ensure that + // the interval picker avoids this interval. + LOG(INFO) << interval_picker.ToDebugString(); + EXPECT_EQ(interval_picker.Next(), 25); + LOG(INFO) << interval_picker.ToDebugString(); + EXPECT_EQ(interval_picker.Next(), 26); + LOG(INFO) << interval_picker.ToDebugString(); + EXPECT_EQ(interval_picker.Next(), 18); + LOG(INFO) << interval_picker.ToDebugString(); + EXPECT_EQ(interval_picker.Next(), 27); // Min async overlap ratio reached. + LOG(INFO) << interval_picker.ToDebugString(); + EXPECT_EQ(interval_picker.Next(), 17); // Max async overlap ratio reached. + LOG(INFO) << interval_picker.ToDebugString(); + EXPECT_TRUE(interval_picker.Done()); +} + +TEST_F(CostAnalysisPrefetchIntervalPickerTest, NestedWhile) { + // This test is to check against a bug where we didn't assign + // while_nest_level_ for while instructions, and defaulting to 0. This could + // cause the prefetch interval logic to think a nested while instruction is + // the same level as the outermost computation. + absl::string_view hlo_string = R"( + HloModule bug, is_scheduled=true + + while_condition.2 { + param1 = (f32[2,4]) parameter(0) // 11 + ROOT cond = pred[] constant(true) // 12 + } + + while_body.2 { + param2 = (f32[2,4]) parameter(0) // 13 + gte2 = f32[2,4] get-tuple-element(param2), index=0 // 14 + add = f32[2,4] add(gte2, gte2) // 15 + ROOT tuple2 = (f32[2,4]) tuple(add) // 16 + } + + while_condition.1 { + param3 = (f32[2,4]) parameter(0) // 5 + ROOT cond = pred[] constant(true) // 6 + } + + while_body.1 { + param4 = (f32[2,4]) parameter(0) // 7 + gte1 = f32[2,4] get-tuple-element(param4), index=0 // 8 + add1 = f32[2,4] add(gte1, gte1) // 9 + tuple1 = (f32[2,4]) tuple(add1) // 10 + while = (f32[2,4]) while(tuple1), condition=while_condition.2, body=while_body.2 // 17 + gte2 = f32[2,4] get-tuple-element(while), index=0 // 18 + add2 = f32[2,4] add(gte2, gte2) // 19 + ROOT tuple2 = (f32[2,4]) tuple(add2) // 20 + } + + ENTRY Entry { + param0 = f32[2,4] parameter(0) // 0 + a = f32[2,4] negate(param0) // 1 + b = f32[2,4] negate(a) // 2 + c = f32[2,4] negate(b) // 3 + tuple = (f32[2,4]) tuple(c) // 4 + while = (f32[2,4]) while(tuple), condition=while_condition.1, body=while_body.1 // 21 + gte1 = f32[2,4] get-tuple-element(while), index=0 // 22 + ROOT root = f32[2,4] add(gte1, param0) // 23 + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + + HloCostAnalysis hlo_cost_analysis(ShapeSize); + TF_ASSERT_OK_AND_ASSIGN(auto cost_analysis, + FakeMemorySpaceAssignmentCostAnalysis::Create( + hlo_cost_analysis, *module)); + CostAnalysisPrefetchIntervalPicker interval_picker( + *cost_analysis, + /*min_async_copy_to_overlap_ratio=*/1.0, + /*max_async_copy_to_overlap_ratio=*/12.0, + /*preferred_async_copy_to_overlap_ratio=*/2.0); + + HloInstruction* root = module->entry_computation()->root_instruction(); + const HloUse use{root, /*operand_number=*/1, /*operand_index=*/{}}; + + // We expect the root's latest prefetch start time to be before the while loop + // (logical time 4). 
+ EXPECT_EQ(interval_picker.LatestPrefetchStartTime(use, /*start_time=*/0, + /*end_time=*/23), + 4); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/mlir_gpu/BUILD b/tensorflow/compiler/xla/service/mlir_gpu/BUILD index 113c9764b40..31cf36dee85 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/BUILD +++ b/tensorflow/compiler/xla/service/mlir_gpu/BUILD @@ -82,6 +82,7 @@ cc_library( ":kernel_lowering", ":lhlo_dialect_emitter", "@com_google_absl//absl/container:flat_hash_map", + "@llvm-project//llvm:Core", "@llvm-project//mlir:GPUDialect", "@llvm-project//mlir:AllPassesAndDialects", "@llvm-project//mlir:IR", @@ -154,13 +155,33 @@ cc_library( ], ) +cc_library( + name = "passes", + srcs = ["passes.cc"], + hdrs = ["passes.h"], + deps = [ + "//tensorflow/compiler/mlir/hlo:lhlo", + "@com_google_absl//absl/memory", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:GPUDialect", + "@llvm-project//mlir:GPUTransforms", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SCFDialect", + "@llvm-project//mlir:SCFTransforms", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Transforms", + ], +) + cc_library( name = "kernel_lowering", srcs = ["kernel_lowering.cc"], hdrs = ["kernel_lowering.h"], deps = [ + ":passes", "//tensorflow/compiler/mlir/hlo", - "//tensorflow/compiler/mlir/hlo:hlo_dialect_registration", + "//tensorflow/compiler/mlir/hlo:hlo_dialect_force_registration", "//tensorflow/compiler/mlir/hlo:hlo_legalize_to_lhlo", "//tensorflow/compiler/mlir/hlo:legalize_tanh_to_approximation", "//tensorflow/compiler/mlir/hlo:legalize_to_linalg", @@ -172,9 +193,7 @@ cc_library( "//tensorflow/compiler/xla:status", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:util", - "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/memory", - "@llvm-project//mlir:Affine", "@llvm-project//mlir:AffineToStandardTransforms", "@llvm-project//mlir:CFGTransforms", "@llvm-project//mlir:GPUDialect", @@ -183,7 +202,6 @@ cc_library( "@llvm-project//mlir:IR", "@llvm-project//mlir:LLVMDialect", "@llvm-project//mlir:LLVMTransforms", - "@llvm-project//mlir:LinalgOps", "@llvm-project//mlir:LinalgToLLVM", "@llvm-project//mlir:LinalgTransforms", "@llvm-project//mlir:NVVMDialect", @@ -192,7 +210,6 @@ cc_library( "@llvm-project//mlir:SCFToGPUPass", "@llvm-project//mlir:SCFTransforms", "@llvm-project//mlir:StandardOps", - "@llvm-project//mlir:Support", "@llvm-project//mlir:Transforms", ], ) diff --git a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc index 648c44d9ac1..ae99cc9ba63 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc @@ -22,419 +22,26 @@ limitations under the License. 
#include "mlir/Conversion/SCFToGPU/SCFToGPUPass.h" // from @llvm-project #include "mlir/Conversion/SCFToStandard/SCFToStandard.h" // from @llvm-project #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" // from @llvm-project -#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" // from @llvm-project -#include "mlir/Dialect/Affine/IR/AffineOps.h" // from @llvm-project #include "mlir/Dialect/GPU/GPUDialect.h" // from @llvm-project -#include "mlir/Dialect/GPU/ParallelLoopMapper.h" // from @llvm-project #include "mlir/Dialect/GPU/Passes.h" // from @llvm-project #include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project #include "mlir/Dialect/LLVMIR/NVVMDialect.h" // from @llvm-project -#include "mlir/Dialect/Linalg/IR/LinalgOps.h" // from @llvm-project #include "mlir/Dialect/Linalg/Passes.h" // from @llvm-project #include "mlir/Dialect/SCF/Passes.h" // from @llvm-project -#include "mlir/Dialect/SCF/SCF.h" // from @llvm-project #include "mlir/Dialect/SCF/Transforms.h" // from @llvm-project -#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project -#include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/BlockAndValueMapping.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project -#include "mlir/IR/Function.h" // from @llvm-project -#include "mlir/IR/Module.h" // from @llvm-project -#include "mlir/IR/OperationSupport.h" // from @llvm-project -#include "mlir/IR/PatternMatch.h" // from @llvm-project -#include "mlir/IR/Region.h" // from @llvm-project -#include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project #include "mlir/Transforms/BufferPlacement.h" // from @llvm-project -#include "mlir/Transforms/DialectConversion.h" // from @llvm-project -#include "mlir/Transforms/LoopUtils.h" // from @llvm-project #include "mlir/Transforms/Passes.h" // from @llvm-project -#include "mlir/Transforms/RegionUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h" #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h" +#include "tensorflow/compiler/xla/service/mlir_gpu/passes.h" #include "tensorflow/compiler/xla/util.h" namespace xla { namespace mlir_gpu { -namespace { - -using ::mlir::lmhlo::FusionOp; - -// Replaces a FusionOp by the operations contained in its region. -struct FusionOpRemover - : public mlir::PassWrapper { - void runOnFunction() override { - getFunction().walk([&](FusionOp op) { - mlir::OpBuilder builder(op); - // FusionOp has a single region with a single block, so we can just walk - // over it and clone operations to the outside. - mlir::BlockAndValueMapping mapping; - for (auto& nested_op : op.region().front().without_terminator()) { - auto clone = builder.clone(nested_op, mapping); - for (auto pair : - llvm::zip(nested_op.getResults(), clone->getResults())) { - mapping.map(std::get<0>(pair), std::get<1>(pair)); - } - } - op.erase(); - }); - } -}; - -// Simple pass that replaces a load that immediately follows a store to the -// same address with the stored value. This needs generalization. -struct StoreForwardingPass - : mlir::PassWrapper { - mlir::StoreOp findStore(mlir::Operation* op, - std::function matches) { - // Search from op upwards in the current block. 
- mlir::Block* block = op->getBlock(); - auto startFromIt = - std::find_if(block->rbegin(), block->rend(), - [op](mlir::Operation& other) { return &other == op; }); - for (auto storeOpIt = startFromIt; storeOpIt != block->rend(); - ++storeOpIt) { - auto storeOp = llvm::dyn_cast(&*(storeOpIt)); - if (!storeOp || !matches(storeOp)) { - continue; - } - - return storeOp; - } - // No store operation found. Continue search outside of the parallel - // loop if block is in a parallel loop. - if (auto parallelOp = - llvm::dyn_cast(block->getParentOp())) { - return findStore(parallelOp.getOperation(), matches); - } - return {}; - } - - // Recursively search defining ops for AllocOp. Return either AllocOp if it is - // found or nullptr. - mlir::Operation* SearchAllocOp(mlir::Value memref) { - mlir::Operation* defOp = memref.getDefiningOp(); - while (auto subviewOp = mlir::dyn_cast_or_null(defOp)) { - defOp = subviewOp.source().getDefiningOp(); - } - if (auto allocOp = mlir::dyn_cast_or_null(defOp)) { - return allocOp.getOperation(); - } - return nullptr; - } - - // Retrieves AllocOp from the cache or actually looks for it. - mlir::Operation* GetAllocOp( - mlir::Value memref, - llvm::DenseMap* memrefToAllocOp) { - auto allocOpIt = memrefToAllocOp->find(memref); - if (allocOpIt != memrefToAllocOp->end()) { - return allocOpIt->second; - } - auto allocOp = SearchAllocOp(memref); - memrefToAllocOp->insert({memref, allocOp}); - return allocOp; - } - - void runOnFunction() override { - llvm::DenseMap memrefToAllocOp; - - getFunction().walk([&](mlir::LoadOp loadOp) { - auto storeOp = findStore(loadOp, [&](mlir::StoreOp storeOp) { - mlir::Operation* storeOpAlloc = - GetAllocOp(storeOp.memref(), &memrefToAllocOp); - mlir::Operation* loadOpAlloc = - GetAllocOp(loadOp.memref(), &memrefToAllocOp); - return storeOpAlloc && loadOpAlloc && (storeOpAlloc == loadOpAlloc); - }); - if (!storeOp) { - return; - } - auto storeIndices = storeOp.getIndices(); - auto loadIndices = loadOp.getIndices(); - if (!std::equal(storeIndices.begin(), storeIndices.end(), - loadIndices.begin(), loadIndices.end())) { - return; - } - loadOp.replaceAllUsesWith(storeOp.getValueToStore()); - loadOp.erase(); - }); - } -}; - -// Simple pass that removes temporary buffers that are only written to but -// never read from or that are read but the read value is not used. -// Needs an analysis that proves that loads and stores are side-effect free -// (in bounds, no aliasing, etc.). -struct DeadTempBufferRemoval - : mlir::PassWrapper { - bool operationConsideredDead(mlir::Operation* op) { - for (auto result : op->getResults()) { - if (!llvm::all_of(result.getUsers(), [&](mlir::Operation* op) { - // Store and Dealloc is OK. - if (llvm::isa(op)) { - return true; - } - // Load without uses is also ok. - if (auto loadOp = llvm::dyn_cast(op)) { - return loadOp.use_empty(); - } - // Subview is ok if it is dead itself. - if (llvm::isa(op)) { - return operationConsideredDead(op); - } - return false; - })) { - return false; - } - } - return true; - } - - void recursiveErase(mlir::Operation* op, - llvm::SmallVectorImpl* erase_list) { - for (auto result : op->getResults()) { - for (auto user : llvm::make_early_inc_range(result.getUsers())) { - recursiveErase(user, erase_list); - } - } - erase_list->push_back(op); - } - - void runOnFunction() override { - llvm::SmallVector dead_ops; - getFunction().walk([&](mlir::AllocOp allocOp) { - if (!operationConsideredDead(allocOp)) { - return; - } - - // TODO(herhut): There should be a generic helper for this. 
- recursiveErase(allocOp, &dead_ops); - }); - for (auto op : dead_ops) { - op->erase(); - } - } -}; - -// TODO(herhut): Move this to MLIR core. -struct MoveScalarComputationsIntoGpuLaunch - : mlir::PassWrapper { - static bool isInliningBeneficiary(mlir::Operation* op) { - return llvm::isa(op); - } - - static bool extractBeneficiaryOps( - mlir::Operation* op, llvm::SmallVectorImpl* ops, - llvm::SetVector args) { - if (!isInliningBeneficiary(op)) { - return false; - } - - ops->push_back(op); - for (auto operand : op->getOperands()) { - // It is an existing arg, keep going. - if (args.count(operand)) { - continue; - } - mlir::Operation* definingOp = operand.getDefiningOp(); - if (!definingOp || !extractBeneficiaryOps(definingOp, ops, args)) { - return false; - } - } - return true; - } - - static void inlineOperationsIntoLaunch(mlir::gpu::LaunchOp launch) { - llvm::SetVector used_above; - mlir::getUsedValuesDefinedAbove(launch.body(), used_above); - mlir::BlockAndValueMapping inlined_map; - for (mlir::Value v : used_above) { - llvm::SmallVector ops_to_move; - mlir::Operation* definingOp = v.getDefiningOp(); - if (definingOp && - extractBeneficiaryOps(definingOp, &ops_to_move, used_above)) { - mlir::OpBuilder b(launch.body()); - for (mlir::Operation* op : llvm::reverse(ops_to_move)) { - auto result = b.clone(*op, inlined_map); - for (auto pair : llvm::zip(op->getResults(), result->getResults())) { - mlir::replaceAllUsesInRegionWith(std::get<0>(pair), - std::get<1>(pair), launch.body()); - } - inlined_map.map(op->getResults(), result->getResults()); - } - } - } - } - - void runOnFunction() override { - mlir::FuncOp fun = getFunction(); - fun.walk( - [](mlir::gpu::LaunchOp launch) { inlineOperationsIntoLaunch(launch); }); - } -}; - -// Sort the operands to the kernel for a deterministic order. First operands -// that are defined by function arguments, followed by operands that are -// returned from the function. This only works for simple functions without -// control flow and can be used in cases where the kernel is extracted and used -// independently of the host-side code. -struct RewriteKernelSignature - : mlir::PassWrapper { - void runOnFunction() override { - mlir::FuncOp func = getFunction(); - mlir::ModuleOp module = func.getParentOfType(); - getFunction().walk([&](mlir::gpu::LaunchFuncOp launchOp) { - mlir::gpu::GPUFuncOp kernel = - module.lookupSymbol(launchOp.kernel()); - - if (kernel.getNumFuncArguments() != - func.getNumArguments() + func.getNumResults()) { - kernel.emitError() - << "number of kernel arguments does not match number" - << "of arguments and results of surrounding function"; - signalPassFailure(); - return; - } - if (!llvm::hasSingleElement(func)) { - func.emitError() << "surrounding function has more than one block"; - signalPassFailure(); - return; - } - - // Compute a map from function arguments to kernel function operands. - mlir::BlockAndValueMapping func_to_kernel; - for (mlir::BlockArgument arg : func.getArguments()) { - for (int i = 0, e = launchOp.getNumKernelOperands(); i < e; ++i) { - if (launchOp.getKernelOperand(i) == arg) { - func_to_kernel.map(arg, kernel.getArgument(i)); - break; - } - } - } - // Also add function results that are computed by the launch. 
- mlir::Operation* returnOp = func.getBody().back().getTerminator(); - for (mlir::Value result : returnOp->getOperands()) { - for (int i = 0, e = launchOp.getNumKernelOperands(); i < e; ++i) { - if (launchOp.getKernelOperand(i) == result) { - func_to_kernel.map(result, kernel.getArgument(i)); - break; - } - } - } - - // Create a new kernel function with modified signature. It will have the - // parameters and result types of the original funcion as its parameter - // type and otherwise will be void. - auto gpu_module = kernel.getParentOfType(); - mlir::OpBuilder kernel_builder(gpu_module.body()); - auto operand_types = llvm::to_vector<4>(llvm::concat( - func.getType().getInputs(), func.getType().getResults())); - auto new_kernel = kernel_builder.create( - kernel.getLoc(), kernel.getName(), - kernel_builder.getFunctionType(operand_types, {})); - new_kernel.setAttr(mlir::gpu::GPUDialect::getKernelFuncAttrName(), - kernel_builder.getUnitAttr()); - - // Create a map from old kernel argument to new one. - mlir::BlockAndValueMapping old_kernel_to_new; - for (int i = 0, e = func.getNumArguments(); i < e; ++i) { - mlir::Value func_arg = func.getArgument(i); - mlir::Value new_kernel_arg = new_kernel.getArgument(i); - mlir::Value old_kernel_arg = func_to_kernel.lookupOrNull(func_arg); - if (!old_kernel_arg) { - kernel.emitOpError() - << "argument " << i - << " to containing function is not an argument to the kernel"; - signalPassFailure(); - return; - } - old_kernel_to_new.map(old_kernel_arg, new_kernel_arg); - } - for (int i = 0, e = returnOp->getNumOperands(); i < e; ++i) { - mlir::Value ret_op = returnOp->getOperand(i); - mlir::Value new_kernel_arg = - new_kernel.getArgument(func.getNumArguments() + i); - mlir::Value old_kernel_arg = func_to_kernel.lookupOrNull(ret_op); - if (!old_kernel_arg) { - kernel.emitOpError() - << "result " << i - << " of containing function is not an argument to the kernel"; - signalPassFailure(); - return; - } - old_kernel_to_new.map(old_kernel_arg, new_kernel_arg); - } - // Steal the body by appending the blocks and inserting a branch. - kernel.body().cloneInto(&new_kernel.getBody(), old_kernel_to_new); - kernel_builder.setInsertionPointToEnd(&new_kernel.body().front()); - kernel_builder.create( - new_kernel.getLoc(), &*std::next(new_kernel.body().begin())); - // Now create a new launchOp calling the new kernel. We need to forward - // the arguments of the surrounding function and operands to the return. - mlir::SmallVector new_operands; - new_operands.reserve(new_kernel.getNumFuncArguments()); - new_operands.append(func.args_begin(), func.args_end()); - new_operands.append(returnOp->operand_begin(), returnOp->operand_end()); - mlir::OpBuilder launch_builder(launchOp); - launch_builder.create( - launchOp.getLoc(), new_kernel, launchOp.getGridSizeOperandValues(), - launchOp.getBlockSizeOperandValues(), new_operands); - // Launch does not have results, so we can just erase it. And the kernel - // also needs to go. - launchOp.erase(); - kernel.erase(); - }); - } -}; - -// Extract_element(mhlo_scalars_to_dimension_tensor(v_i), i) -> v_i -// -// We need to direct fusion to the inner loops. This cannot be done with -// a passmanager alone ATM, as nested pass managers require operations to -// be closed from above. -struct MapParallelLoops - : public mlir::PassWrapper { - void runOnFunction() override { - mlir::greedilyMapParallelSCFToGPU(getFunction().getBody()); - } -}; - -// We need to direct fusion to the inner loops. 
This cannot be done with -// a passmanager alone ATM, as nested pass managers require operations to -// be closed from above. -struct FuseInnerParallelLoops - : public mlir::PassWrapper { - void runOnFunction() override { - getFunction().walk([](mlir::scf::ParallelOp op) { - mlir::scf::naivelyFuseParallelOps(op.region()); - }); - } -}; - -// Collapse all loop dimension into the first one. -struct ParallelLoopCollapsingToFirstDim - : public mlir::PassWrapper> { - void runOnOperation() override { - mlir::Operation* module = getOperation(); - - module->walk([&](mlir::scf::ParallelOp op) { - unsigned num_loops = op.getNumLoops(); - std::vector combinedLoops; - combinedLoops.reserve(num_loops); - for (unsigned i = 0; i < num_loops; ++i) { - combinedLoops.push_back(i); - } - mlir::collapseParallelLoops(op, {combinedLoops}); - }); - } -}; -} // namespace Status LowerLHLOToGPU(mlir::ModuleOp module, LowerLHLOToGPUOptions options) { mlir::PassManager pm(module.getContext()); @@ -461,14 +68,14 @@ Status LowerLHLOToGPU(mlir::ModuleOp module, LowerLHLOToGPUOptions options) { // Moving `AllocOp`s and inserting missing `DeallocOp`s pm.addPass(::mlir::createBufferPlacementPass()); // Next, we can strip the outer fusion operation. - pm.addPass(absl::make_unique()); + pm.addPass(createFusionOpRemoverPass()); // Remove unnecessary LHLO copies. pm.addPass(::mlir::lmhlo::createLhloCopyRemovalPass()); // Transform LHLO operations to LinAlg. pm.addPass(::mlir::lmhlo::createLegalizeLhloToLinalgPass()); // Fuse linalg operations. - pm.addPass(::mlir::lmhlo::createLhloFuseLinalg(/*use_parallel_loops=*/true, - tiling_for_unrolling)); + pm.addPass(::mlir::lmhlo::createLhloFuseLinalgPass( + /*use_parallel_loops=*/true, tiling_for_unrolling)); // Legalize reduce operations directly to GPU dialect. pm.addPass(::mlir::lmhlo::createLegalizeToGpuPass()); // Transform the Linalg operations inside of the loop nest into parallel @@ -479,26 +86,26 @@ Status LowerLHLOToGPU(mlir::ModuleOp module, LowerLHLOToGPUOptions options) { pm.addNestedPass<::mlir::FuncOp>(::mlir::createCanonicalizerPass()); pm.addNestedPass<::mlir::FuncOp>(::mlir::createCSEPass()); // Fuse the inner-most loops. - pm.addPass(absl::make_unique()); + pm.addPass(createFuseInnerParallelLoopsPass()); // Run CSE to ensure that loads and stores to the same subview get // recognized as such. pm.addNestedPass<::mlir::FuncOp>(::mlir::createCSEPass()); // Forward stores to buffers to loads. - pm.addPass(absl::make_unique()); + pm.addPass(createStoreForwardingPass()); // Remove now unused temporary buffers. - pm.addPass(absl::make_unique()); + pm.addPass(createDeadTempBufferRemovalPass()); if (!options.unroll_factors.empty()) { pm.addPass(::mlir::createParallelLoopTilingPass(as_int64)); } // Project all loop dimensions to X if necessary. if (options.collapse_parallel_loops) { - pm.addPass(absl::make_unique()); + pm.addPass(createParallelLoopCollapsingToFirstDimPass()); } // Some basic cleanup. pm.addNestedPass<::mlir::FuncOp>(::mlir::createCanonicalizerPass()); pm.addNestedPass<::mlir::FuncOp>(::mlir::createCSEPass()); // Greedily map the remaining loop to GPU hardware dimensions. - pm.addPass(absl::make_unique()); + pm.addPass(createMapParallelLoopsPass()); // Apply the mapping. pm.addPass(mlir::createParallelLoopToGpuPass()); // Some basic cleanup. @@ -512,16 +119,16 @@ Status LowerLHLOToGPU(mlir::ModuleOp module, LowerLHLOToGPUOptions options) { // Approximate of requested. 
if (options.use_approximations) { pm.addNestedPass<::mlir::FuncOp>( - ::mlir::hlo::createLegalizeTanhToApproximationPass()); + ::mlir::mhlo::createLegalizeTanhToApproximationPass()); } // Move scalar operations into the launch to ensure smaller signatures. - pm.addPass(absl::make_unique()); + pm.addPass(createMoveScalarComputationsIntoGpuLaunchPass()); // Take launches to launches with kernels. pm.addPass(::mlir::createGpuKernelOutliningPass()); // Make sure the kernel signature resembled the original function's // signature if (options.rewrite_signature) { - pm.addPass(absl::make_unique()); + pm.addPass(createRewriteKernelSignaturePass()); } if (failed(pm.run(module))) { return InternalError("Lowering to GPU kernels failed."); @@ -595,5 +202,6 @@ StatusOr ExtractKernelModule(mlir::ModuleOp module) { }); return kernelModule; } + } // namespace mlir_gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc b/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc index 194eb4618d3..e0d7456fbb8 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc @@ -205,7 +205,7 @@ LhloDialectEmitter::LhloDialectEmitter( platform_(platform) { LLVMDialect* llvmDialect = mlir_module.getContext()->getRegisteredDialect(); - pointer_size_ = llvmDialect->getLLVMModule().getDataLayout().getPointerSize(); + pointer_size_ = llvmDialect->getDataLayout().getPointerSize(); } void LhloDialectEmitter::AddThunkToThunkSequence(std::unique_ptr thunk) { diff --git a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.cc b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.cc index 458522f89e6..df2bd2e4c23 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.cc @@ -30,18 +30,14 @@ namespace { using ::mlir::MLIRContext; using ::mlir::LLVM::LLVMDialect; -int64 ConfigureLLVMModuleAndGetPointerSize(MLIRContext* context) { +int64 GetPointerSize(MLIRContext* context) { LLVMDialect* dialect = context->getRegisteredDialect(); - llvm::Module& module = dialect->getLLVMModule(); - module.setTargetTriple(gpu::nvptx::kTargetTriple); - module.setDataLayout(gpu::nvptx::kDataLayout); - return module.getDataLayout().getPointerSize(); + return dialect->getDataLayout().getPointerSize(); } } // namespace -MlirCompiler::MlirCompiler() - : pointer_size_(ConfigureLLVMModuleAndGetPointerSize(&context_)) {} +MlirCompiler::MlirCompiler() : pointer_size_(GetPointerSize(&context_)) {} se::Platform::Id MlirCompiler::PlatformId() const { return stream_executor::cuda::kCudaPlatformId; diff --git a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc index 2c2076bbd97..4879c6b5099 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include #include "absl/container/flat_hash_map.h" +#include "llvm/IR/LLVMContext.h" #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" // from @llvm-project #include "mlir/Dialect/GPU/GPUDialect.h" // from @llvm-project #include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project @@ -292,10 +293,10 @@ Status InsertBufferLoadPreduleIntoKernel( BufferAssignment* assignment, const std::vector& buffers) { mlir::OpBuilder builder(kernel.getBody()); - auto llvm_dialect = kernel.getContext()->getRegisteredDialect(); - auto offset_type = LLVMType::getInt64Ty(llvm_dialect); - auto ptr_type = LLVMType::getInt8PtrTy(llvm_dialect); - auto void_type = LLVMType::getVoidTy(llvm_dialect); + auto* context = kernel.getContext(); + auto offset_type = LLVMType::getInt64Ty(context); + auto ptr_type = LLVMType::getInt8PtrTy(context); + auto void_type = LLVMType::getVoidTy(context); auto loc = kernel.getLoc(); auto num_original_args = kernel.getNumArguments(); @@ -543,7 +544,11 @@ StatusOr> MlirCompilerImpl::RunBackend( TF_RETURN_IF_ERROR( module_hook_.invoke(IRHook::LoweringStage::KERNEL, *kernel_module)); - auto llvmModule = mlir::translateModuleToNVVMIR(*kernel_module); + // Translate to LLVM IR in a fresh context. The module is further translated + // to textual PTX and a CUBIN blob so there is no need for the context to live + // longer than this function. + llvm::LLVMContext llvmContext; + auto llvmModule = mlir::translateModuleToNVVMIR(*kernel_module, llvmContext); if (!llvmModule) { return InternalError("Translation to LLVM failed"); diff --git a/tensorflow/compiler/xla/service/mlir_gpu/passes.cc b/tensorflow/compiler/xla/service/mlir_gpu/passes.cc new file mode 100644 index 00000000000..887f14e90d9 --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/passes.cc @@ -0,0 +1,423 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/mlir_gpu/passes.h" + +#include "absl/memory/memory.h" +#include "llvm/ADT/SetVector.h" +#include "mlir/Dialect/GPU/GPUDialect.h" // from @llvm-project +#include "mlir/Dialect/GPU/ParallelLoopMapper.h" // from @llvm-project +#include "mlir/Dialect/SCF/SCF.h" // from @llvm-project +#include "mlir/Dialect/SCF/Transforms.h" // from @llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/BlockAndValueMapping.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/Transforms/LoopUtils.h" // from @llvm-project +#include "mlir/Transforms/RegionUtils.h" // from @llvm-project +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" + +namespace xla { +namespace mlir_gpu { +namespace { + +struct FusionOpRemoverPass + : public mlir::PassWrapper { + void runOnFunction() override { + getFunction().walk([&](mlir::lmhlo::FusionOp op) { + mlir::OpBuilder builder(op); + // FusionOp has a single region with a single block, so we can just walk + // over it and clone operations to the outside. + mlir::BlockAndValueMapping mapping; + for (auto& nested_op : op.region().front().without_terminator()) { + auto clone = builder.clone(nested_op, mapping); + for (auto pair : + llvm::zip(nested_op.getResults(), clone->getResults())) { + mapping.map(std::get<0>(pair), std::get<1>(pair)); + } + } + op.erase(); + }); + } +}; + +struct StoreForwardingPass + : mlir::PassWrapper { + mlir::StoreOp findStore(mlir::Operation* op, + std::function matches) { + // Search from op upwards in the current block. + mlir::Block* block = op->getBlock(); + auto startFromIt = + std::find_if(block->rbegin(), block->rend(), + [op](mlir::Operation& other) { return &other == op; }); + for (auto storeOpIt = startFromIt; storeOpIt != block->rend(); + ++storeOpIt) { + auto storeOp = llvm::dyn_cast(&*(storeOpIt)); + if (!storeOp || !matches(storeOp)) { + continue; + } + + return storeOp; + } + // No store operation found. Continue search outside of the parallel + // loop if block is in a parallel loop. + if (auto parallelOp = + llvm::dyn_cast(block->getParentOp())) { + return findStore(parallelOp.getOperation(), matches); + } + return {}; + } + + // Recursively search defining ops for AllocOp. Return either AllocOp if it is + // found or nullptr. + mlir::Operation* SearchAllocOp(mlir::Value memref) { + mlir::Operation* defOp = memref.getDefiningOp(); + while (auto subviewOp = mlir::dyn_cast_or_null(defOp)) { + defOp = subviewOp.source().getDefiningOp(); + } + if (auto allocOp = mlir::dyn_cast_or_null(defOp)) { + return allocOp.getOperation(); + } + return nullptr; + } + + // Retrieves AllocOp from the cache or actually looks for it. 
+ mlir::Operation* GetAllocOp( + mlir::Value memref, + llvm::DenseMap* memrefToAllocOp) { + auto allocOpIt = memrefToAllocOp->find(memref); + if (allocOpIt != memrefToAllocOp->end()) { + return allocOpIt->second; + } + auto allocOp = SearchAllocOp(memref); + memrefToAllocOp->insert({memref, allocOp}); + return allocOp; + } + + void runOnFunction() override { + llvm::DenseMap memrefToAllocOp; + + getFunction().walk([&](mlir::LoadOp loadOp) { + auto storeOp = findStore(loadOp, [&](mlir::StoreOp storeOp) { + mlir::Operation* storeOpAlloc = + GetAllocOp(storeOp.memref(), &memrefToAllocOp); + mlir::Operation* loadOpAlloc = + GetAllocOp(loadOp.memref(), &memrefToAllocOp); + return storeOpAlloc && loadOpAlloc && (storeOpAlloc == loadOpAlloc); + }); + if (!storeOp) { + return; + } + auto storeIndices = storeOp.getIndices(); + auto loadIndices = loadOp.getIndices(); + if (!std::equal(storeIndices.begin(), storeIndices.end(), + loadIndices.begin(), loadIndices.end())) { + return; + } + loadOp.replaceAllUsesWith(storeOp.getValueToStore()); + loadOp.erase(); + }); + } +}; + +struct DeadTempBufferRemovalPass + : mlir::PassWrapper { + bool operationConsideredDead(mlir::Operation* op) { + for (auto result : op->getResults()) { + if (!llvm::all_of(result.getUsers(), [&](mlir::Operation* op) { + // Store and Dealloc is OK. + if (llvm::isa(op)) { + return true; + } + // Load without uses is also ok. + if (auto loadOp = llvm::dyn_cast(op)) { + return loadOp.use_empty(); + } + // Subview is ok if it is dead itself. + if (llvm::isa(op)) { + return operationConsideredDead(op); + } + return false; + })) { + return false; + } + } + return true; + } + + void recursiveErase(mlir::Operation* op, + llvm::SmallVectorImpl* erase_list) { + for (auto result : op->getResults()) { + for (auto user : llvm::make_early_inc_range(result.getUsers())) { + recursiveErase(user, erase_list); + } + } + erase_list->push_back(op); + } + + void runOnFunction() override { + llvm::SmallVector dead_ops; + getFunction().walk([&](mlir::AllocOp allocOp) { + if (!operationConsideredDead(allocOp)) { + return; + } + + // TODO(herhut): There should be a generic helper for this. + recursiveErase(allocOp, &dead_ops); + }); + for (auto op : dead_ops) { + op->erase(); + } + } +}; + +struct MoveScalarComputationsIntoGpuLaunchPass + : mlir::PassWrapper { + static bool isInliningBeneficiary(mlir::Operation* op) { + return llvm::isa(op); + } + + static bool extractBeneficiaryOps( + mlir::Operation* op, llvm::SmallVectorImpl* ops, + llvm::SetVector args) { + if (!isInliningBeneficiary(op)) { + return false; + } + + ops->push_back(op); + for (auto operand : op->getOperands()) { + // It is an existing arg, keep going. 
+ if (args.count(operand)) { + continue; + } + mlir::Operation* definingOp = operand.getDefiningOp(); + if (!definingOp || !extractBeneficiaryOps(definingOp, ops, args)) { + return false; + } + } + return true; + } + + static void inlineOperationsIntoLaunch(mlir::gpu::LaunchOp launch) { + llvm::SetVector used_above; + mlir::getUsedValuesDefinedAbove(launch.body(), used_above); + mlir::BlockAndValueMapping inlined_map; + for (mlir::Value v : used_above) { + llvm::SmallVector ops_to_move; + mlir::Operation* definingOp = v.getDefiningOp(); + if (definingOp && + extractBeneficiaryOps(definingOp, &ops_to_move, used_above)) { + mlir::OpBuilder b(launch.body()); + for (mlir::Operation* op : llvm::reverse(ops_to_move)) { + auto result = b.clone(*op, inlined_map); + for (auto pair : llvm::zip(op->getResults(), result->getResults())) { + mlir::replaceAllUsesInRegionWith(std::get<0>(pair), + std::get<1>(pair), launch.body()); + } + inlined_map.map(op->getResults(), result->getResults()); + } + } + } + } + + void runOnFunction() override { + mlir::FuncOp fun = getFunction(); + fun.walk( + [](mlir::gpu::LaunchOp launch) { inlineOperationsIntoLaunch(launch); }); + } +}; + +struct RewriteKernelSignaturePass + : mlir::PassWrapper { + void runOnFunction() override { + mlir::FuncOp func = getFunction(); + mlir::ModuleOp module = func.getParentOfType(); + getFunction().walk([&](mlir::gpu::LaunchFuncOp launchOp) { + mlir::gpu::GPUFuncOp kernel = + module.lookupSymbol(launchOp.kernel()); + + if (kernel.getNumFuncArguments() != + func.getNumArguments() + func.getNumResults()) { + kernel.emitError() + << "number of kernel arguments does not match number" + << "of arguments and results of surrounding function"; + signalPassFailure(); + return; + } + if (!llvm::hasSingleElement(func)) { + func.emitError() << "surrounding function has more than one block"; + signalPassFailure(); + return; + } + + // Compute a map from function arguments to kernel function operands. + mlir::BlockAndValueMapping func_to_kernel; + for (mlir::BlockArgument arg : func.getArguments()) { + for (int i = 0, e = launchOp.getNumKernelOperands(); i < e; ++i) { + if (launchOp.getKernelOperand(i) == arg) { + func_to_kernel.map(arg, kernel.getArgument(i)); + break; + } + } + } + // Also add function results that are computed by the launch. + mlir::Operation* returnOp = func.getBody().back().getTerminator(); + for (mlir::Value result : returnOp->getOperands()) { + for (int i = 0, e = launchOp.getNumKernelOperands(); i < e; ++i) { + if (launchOp.getKernelOperand(i) == result) { + func_to_kernel.map(result, kernel.getArgument(i)); + break; + } + } + } + + // Create a new kernel function with modified signature. It will have the + // parameters and result types of the original funcion as its parameter + // type and otherwise will be void. + auto gpu_module = kernel.getParentOfType(); + mlir::OpBuilder kernel_builder(gpu_module.body()); + auto operand_types = llvm::to_vector<4>(llvm::concat( + func.getType().getInputs(), func.getType().getResults())); + auto new_kernel = kernel_builder.create( + kernel.getLoc(), kernel.getName(), + kernel_builder.getFunctionType(operand_types, {})); + new_kernel.setAttr(mlir::gpu::GPUDialect::getKernelFuncAttrName(), + kernel_builder.getUnitAttr()); + + // Create a map from old kernel argument to new one. 
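// A small worked example of the reordering this pass performs (the function
// and value names below are made up for illustration): for a surrounding
// function
//
//   func @main(%arg0: memref<4xf32>, %arg1: memref<4xf32>) -> memref<4xf32>
//
// whose gpu.launch_func passes the kernel operands in whatever order buffer
// assignment produced, the rewritten kernel takes its operands in the order
// (%arg0, %arg1, <returned buffer>): function arguments first, then the
// values returned by the function, as described in the pass documentation.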
+ mlir::BlockAndValueMapping old_kernel_to_new; + for (int i = 0, e = func.getNumArguments(); i < e; ++i) { + mlir::Value func_arg = func.getArgument(i); + mlir::Value new_kernel_arg = new_kernel.getArgument(i); + mlir::Value old_kernel_arg = func_to_kernel.lookupOrNull(func_arg); + if (!old_kernel_arg) { + kernel.emitOpError() + << "argument " << i + << " to containing function is not an argument to the kernel"; + signalPassFailure(); + return; + } + old_kernel_to_new.map(old_kernel_arg, new_kernel_arg); + } + for (int i = 0, e = returnOp->getNumOperands(); i < e; ++i) { + mlir::Value ret_op = returnOp->getOperand(i); + mlir::Value new_kernel_arg = + new_kernel.getArgument(func.getNumArguments() + i); + mlir::Value old_kernel_arg = func_to_kernel.lookupOrNull(ret_op); + if (!old_kernel_arg) { + kernel.emitOpError() + << "result " << i + << " of containing function is not an argument to the kernel"; + signalPassFailure(); + return; + } + old_kernel_to_new.map(old_kernel_arg, new_kernel_arg); + } + // Steal the body by appending the blocks and inserting a branch. + kernel.body().cloneInto(&new_kernel.getBody(), old_kernel_to_new); + kernel_builder.setInsertionPointToEnd(&new_kernel.body().front()); + kernel_builder.create( + new_kernel.getLoc(), &*std::next(new_kernel.body().begin())); + // Now create a new launchOp calling the new kernel. We need to forward + // the arguments of the surrounding function and operands to the return. + mlir::SmallVector new_operands; + new_operands.reserve(new_kernel.getNumFuncArguments()); + new_operands.append(func.args_begin(), func.args_end()); + new_operands.append(returnOp->operand_begin(), returnOp->operand_end()); + mlir::OpBuilder launch_builder(launchOp); + launch_builder.create( + launchOp.getLoc(), new_kernel, launchOp.getGridSizeOperandValues(), + launchOp.getBlockSizeOperandValues(), new_operands); + // Launch does not have results, so we can just erase it. And the kernel + // also needs to go. 
+ launchOp.erase(); + kernel.erase(); + }); + } +}; + +struct MapParallelLoopsPass + : public mlir::PassWrapper { + void runOnFunction() override { + mlir::greedilyMapParallelSCFToGPU(getFunction().getBody()); + } +}; + +struct FuseInnerParallelLoopsPass + : public mlir::PassWrapper { + void runOnFunction() override { + getFunction().walk([](mlir::scf::ParallelOp op) { + mlir::scf::naivelyFuseParallelOps(op.region()); + }); + } +}; + +struct ParallelLoopCollapsingToFirstDimPass + : public mlir::PassWrapper> { + void runOnOperation() override { + mlir::Operation* module = getOperation(); + + module->walk([&](mlir::scf::ParallelOp op) { + unsigned num_loops = op.getNumLoops(); + std::vector combinedLoops; + combinedLoops.reserve(num_loops); + for (unsigned i = 0; i < num_loops; ++i) { + combinedLoops.push_back(i); + } + mlir::collapseParallelLoops(op, {combinedLoops}); + }); + } +}; + +} // namespace + +std::unique_ptr createFusionOpRemoverPass() { + return absl::make_unique(); +} + +std::unique_ptr createStoreForwardingPass() { + return absl::make_unique(); +} + +std::unique_ptr createDeadTempBufferRemovalPass() { + return absl::make_unique(); +} + +std::unique_ptr +createMoveScalarComputationsIntoGpuLaunchPass() { + return absl::make_unique(); +} + +std::unique_ptr createRewriteKernelSignaturePass() { + return absl::make_unique(); +} + +std::unique_ptr createFuseInnerParallelLoopsPass() { + return absl::make_unique(); +} + +std::unique_ptr createMapParallelLoopsPass() { + return absl::make_unique(); +} + +std::unique_ptr> +createParallelLoopCollapsingToFirstDimPass() { + return absl::make_unique(); +} + +} // namespace mlir_gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/mlir_gpu/passes.h b/tensorflow/compiler/xla/service/mlir_gpu/passes.h new file mode 100644 index 00000000000..e3840628a2e --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/passes.h @@ -0,0 +1,66 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_PASSES_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_PASSES_H_ + +#include + +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace xla { +namespace mlir_gpu { + +// TODO(herhut, pifon): Move these passes to MLIR Core. + +/// Replaces a FusionOp by the operations contained in its region. +std::unique_ptr createFusionOpRemoverPass(); + +/// Replaces a load that immediately follows a store to the same address with +/// the stored value. This needs generalization. +std::unique_ptr createStoreForwardingPass(); + +/// Removes temporary buffers that are only written to but never read from or +/// that are read but the read value is not used. Needs an analysis that proves +/// that loads and stores are side-effect free (in bounds, no aliasing, etc.). +std::unique_ptr createDeadTempBufferRemovalPass(); + +/// Moves scalar computations to the GPULaunchOp body. 
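// A minimal usage sketch for the factory functions declared in this header,
// mirroring how LowerLHLOToGPU composes them above. The helper name below is
// hypothetical and error handling is omitted.
//
//   #include "mlir/Pass/PassManager.h"  // from @llvm-project
//   #include "tensorflow/compiler/xla/service/mlir_gpu/passes.h"
//
//   void AddMlirGpuCleanupPasses(mlir::PassManager& pm) {
//     pm.addPass(xla::mlir_gpu::createFusionOpRemoverPass());
//     pm.addPass(xla::mlir_gpu::createStoreForwardingPass());
//     pm.addPass(xla::mlir_gpu::createDeadTempBufferRemovalPass());
//     pm.addPass(xla::mlir_gpu::createMoveScalarComputationsIntoGpuLaunchPass());
//   }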
+std::unique_ptr +createMoveScalarComputationsIntoGpuLaunchPass(); + +/// Sorts the operands to the kernel for a deterministic order. First operands +/// that are defined by function arguments, followed by operands that are +/// returned from the function. This only works for simple functions without +/// control flow and can be used in cases where the kernel is extracted and used +/// independently of the host-side code. +std::unique_ptr createRewriteKernelSignaturePass(); + +/// We need to direct fusion to the inner loops. This cannot be done with +/// a passmanager alone ATM, as nested pass managers require operations to +/// be closed from above. +std::unique_ptr createFuseInnerParallelLoopsPass(); + +/// Greedily maps loops to GPU hardware dimensions. +std::unique_ptr createMapParallelLoopsPass(); + +/// Collapses all loop dimension into the first one. +std::unique_ptr> +createParallelLoopCollapsingToFirstDimPass(); + +} // namespace mlir_gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_PASSES_H_ diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/add_as_kernel.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/add_as_kernel.hlo index 953eb2022f8..8d7930ea8c0 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/add_as_kernel.hlo +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/add_as_kernel.hlo @@ -7,24 +7,24 @@ ENTRY %Add (x: f32[2,2], y: f32[2,2]) -> f32[2,2] { ROOT %add = f32[2,2]{1,0} add(f32[2,2]{1,0} %x, f32[2,2]{1,0} %y) } -// CHECK: func @add_kernel(%[[ARG0:.*]]: [[TYPE:!llvm<.*]], %[[ARG1:.*]]: [[TYPE]], %[[ARG2:.*]]: [[TYPE]] +// CHECK: func @add_kernel(%[[ARG0:.*]]: [[TYPE:!llvm\..*]], %[[ARG1:.*]]: [[TYPE]], %[[ARG2:.*]]: [[TYPE]] // // Check that relevant sizes and strides are emitted. // -// CHECK: %[[CAST0:.*]] = llvm.bitcast %[[ARG0:.*]] : !llvm<"i8*"> to !llvm<"float*"> +// CHECK: %[[CAST0:.*]] = llvm.bitcast %[[ARG0:.*]] : !llvm.ptr to !llvm.ptr // CHECK: %[[SIZE00:.*]] = llvm.mlir.constant(2 : i64) : !llvm.i64 // CHECK: %[[SIZE01:.*]] = llvm.mlir.constant(2 : i64) : !llvm.i64 // CHECK: %[[STRIDE01:.*]] = llvm.mlir.constant(1 : i64) : !llvm.i64 // CHECK: %[[STRIDE00:.*]] = llvm.mlir.constant(2 : i64) : !llvm.i64 -// CHECK: %[[CAST1:.*]] = llvm.bitcast %[[ARG1:.*]] : !llvm<"i8*"> to !llvm<"float*"> +// CHECK: %[[CAST1:.*]] = llvm.bitcast %[[ARG1:.*]] : !llvm.ptr to !llvm.ptr // CHECK: %[[SIZE10:.*]] = llvm.mlir.constant(2 : i64) : !llvm.i64 // CHECK: %[[SIZE11:.*]] = llvm.mlir.constant(2 : i64) : !llvm.i64 // CHECK: %[[STRIDE11:.*]] = llvm.mlir.constant(1 : i64) : !llvm.i64 // CHECK: %[[STRIDE10:.*]] = llvm.mlir.constant(2 : i64) : !llvm.i64 -// CHECK: %[[CAST2:.*]] = llvm.bitcast %[[ARG2:.*]] : !llvm<"i8*"> to !llvm<"float*"> +// CHECK: %[[CAST2:.*]] = llvm.bitcast %[[ARG2:.*]] : !llvm.ptr to !llvm.ptr // CHECK: %[[SIZE20:.*]] = llvm.mlir.constant(2 : i64) : !llvm.i64 // CHECK: %[[SIZE21:.*]] = llvm.mlir.constant(2 : i64) : !llvm.i64 // CHECK: %[[STRIDE21:.*]] = llvm.mlir.constant(1 : i64) : !llvm.i64 @@ -34,30 +34,30 @@ ENTRY %Add (x: f32[2,2], y: f32[2,2]) -> f32[2,2] { // Check that the emitted sizes and strides, as well the pointers to HLO buffers, // are inserted into the memref descriptors. 
// -// CHECK: %[[DESC0:.*]] = llvm.mlir.undef : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -// CHECK: %[[DESC01:.*]] = llvm.insertvalue %[[CAST0]], %[[DESC0]][0] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -// CHECK: %[[DESC02:.*]] = llvm.insertvalue %[[CAST0]], %[[DESC01]][1] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -// CHECK: %[[DESC03:.*]] = llvm.insertvalue %{{.*}}, %[[DESC02]][2] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -// CHECK: %[[DESC04:.*]] = llvm.insertvalue %[[SIZE00]], %[[DESC03]][3, 0] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -// CHECK: %[[DESC05:.*]] = llvm.insertvalue %[[STRIDE00]], %[[DESC04]][4, 0] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -// CHECK: %[[DESC06:.*]] = llvm.insertvalue %[[SIZE01]], %[[DESC05]][3, 1] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -// CHECK: %{{.*}} = llvm.insertvalue %[[STRIDE01]], %[[DESC06]][4, 1] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> +// CHECK: %[[DESC0:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> +// CHECK: %[[DESC01:.*]] = llvm.insertvalue %[[CAST0]], %[[DESC0]][0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> +// CHECK: %[[DESC02:.*]] = llvm.insertvalue %[[CAST0]], %[[DESC01]][1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> +// CHECK: %[[DESC03:.*]] = llvm.insertvalue %{{.*}}, %[[DESC02]][2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> +// CHECK: %[[DESC04:.*]] = llvm.insertvalue %[[SIZE00]], %[[DESC03]][3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> +// CHECK: %[[DESC05:.*]] = llvm.insertvalue %[[STRIDE00]], %[[DESC04]][4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> +// CHECK: %[[DESC06:.*]] = llvm.insertvalue %[[SIZE01]], %[[DESC05]][3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> +// CHECK: %{{.*}} = llvm.insertvalue %[[STRIDE01]], %[[DESC06]][4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> -// CHECK: %[[DESC1:.*]] = llvm.mlir.undef : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -// CHECK: %[[DESC11:.*]] = llvm.insertvalue %[[CAST1]], %[[DESC1]][0] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -// CHECK: %[[DESC12:.*]] = llvm.insertvalue %[[CAST1]], %[[DESC11]][1] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -// CHECK: %[[DESC13:.*]] = llvm.insertvalue %{{.*}}, %[[DESC12]][2] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -// CHECK: %[[DESC14:.*]] = llvm.insertvalue %[[SIZE10]], %[[DESC13]][3, 0] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -// CHECK: %[[DESC15:.*]] = llvm.insertvalue %[[STRIDE10]], %[[DESC14]][4, 0] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -// CHECK: %[[DESC16:.*]] = llvm.insertvalue %[[SIZE11]], %[[DESC15]][3, 1] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -// CHECK: %{{.*}} = llvm.insertvalue %[[STRIDE11]], %[[DESC16]][4, 1] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> +// CHECK: %[[DESC1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> +// CHECK: %[[DESC11:.*]] = llvm.insertvalue %[[CAST1]], %[[DESC1]][0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> +// CHECK: %[[DESC12:.*]] = llvm.insertvalue %[[CAST1]], %[[DESC11]][1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> +// CHECK: %[[DESC13:.*]] = llvm.insertvalue %{{.*}}, %[[DESC12]][2] : 
!llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> +// CHECK: %[[DESC14:.*]] = llvm.insertvalue %[[SIZE10]], %[[DESC13]][3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> +// CHECK: %[[DESC15:.*]] = llvm.insertvalue %[[STRIDE10]], %[[DESC14]][4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> +// CHECK: %[[DESC16:.*]] = llvm.insertvalue %[[SIZE11]], %[[DESC15]][3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> +// CHECK: %{{.*}} = llvm.insertvalue %[[STRIDE11]], %[[DESC16]][4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> -// CHECK: %[[DESC2:.*]] = llvm.mlir.undef : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -// CHECK: %[[DESC21:.*]] = llvm.insertvalue %[[CAST2]], %[[DESC2]][0] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -// CHECK: %[[DESC22:.*]] = llvm.insertvalue %[[CAST2]], %[[DESC21]][1] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -// CHECK: %[[DESC23:.*]] = llvm.insertvalue %{{.*}}, %[[DESC22]][2] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -// CHECK: %[[DESC24:.*]] = llvm.insertvalue %[[SIZE20]], %[[DESC23]][3, 0] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -// CHECK: %[[DESC25:.*]] = llvm.insertvalue %[[STRIDE20]], %[[DESC24]][4, 0] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -// CHECK: %[[DESC26:.*]] = llvm.insertvalue %[[SIZE21]], %[[DESC25]][3, 1] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -// CHECK: %{{.*}} = llvm.insertvalue %[[STRIDE21]], %[[DESC26]][4, 1] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> +// CHECK: %[[DESC2:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> +// CHECK: %[[DESC21:.*]] = llvm.insertvalue %[[CAST2]], %[[DESC2]][0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> +// CHECK: %[[DESC22:.*]] = llvm.insertvalue %[[CAST2]], %[[DESC21]][1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> +// CHECK: %[[DESC23:.*]] = llvm.insertvalue %{{.*}}, %[[DESC22]][2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> +// CHECK: %[[DESC24:.*]] = llvm.insertvalue %[[SIZE20]], %[[DESC23]][3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> +// CHECK: %[[DESC25:.*]] = llvm.insertvalue %[[STRIDE20]], %[[DESC24]][4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> +// CHECK: %[[DESC26:.*]] = llvm.insertvalue %[[SIZE21]], %[[DESC25]][3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> +// CHECK: %{{.*}} = llvm.insertvalue %[[STRIDE21]], %[[DESC26]][4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/copy_transpose.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/copy_transpose.hlo index 3a3dd22b338..8656b4edeb7 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/copy_transpose.hlo +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/copy_transpose.hlo @@ -6,7 +6,7 @@ ENTRY %CopyTranspose (x: f32[2,4]) -> f32[2,4]{0,1} { ROOT %copy = f32[2,4]{0,1} copy(f32[2,4] %x) } -// CHECK: #[[MAP0:.*]] = affine_map<(d0, d1) -> (d0 * 2 + d1)> +// CHECK: #[[MAP0:.*]] = affine_map<(d0, d1) -> (d0 + d1 * 2)> // CHECK: func @copy(%[[OPERAND:.*]]: memref<2x4xf32>, // CHECK-SAME: %[[RESULT:.*]]: memref<2x4xf32, #[[MAP0]]>) // CHECK: "lmhlo.copy"(%[[OPERAND]], %[[RESULT]]) diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc index 
2ed5e709d81..bc79f16db2a 100644 --- a/tensorflow/compiler/xla/service/service.cc +++ b/tensorflow/compiler/xla/service/service.cc @@ -270,12 +270,13 @@ StatusOr> Service::CreateModuleConfig( auto config = absl::make_unique(program_shape); ComputationLayout* computation_layout = config->mutable_entry_computation_layout(); - if (program_shape.parameters_size() != argument_shapes.size()) { + const int64 argument_shapes_size = argument_shapes.size(); + if (program_shape.parameters_size() != argument_shapes_size) { return InvalidArgument("computation takes %d parameters, but %u given", program_shape.parameters_size(), argument_shapes.size()); } - for (int i = 0; i < argument_shapes.size(); ++i) { + for (int i = 0, end = argument_shapes.size(); i < end; ++i) { // Verify that shape of arguments matches the shape of the arguments in the // ProgramShape. if (!ShapeUtil::Compatible(*argument_shapes[i], @@ -315,6 +316,7 @@ StatusOr> Service::CreateModuleConfig( } config->set_use_spmd_partitioning( execution_options->use_spmd_partitioning()); + config->set_deduplicate_hlo(execution_options->deduplicate_hlo()); config->set_seed(execution_options->seed()); config->set_launch_id(execution_options->launch_id()); config->set_debug_options(execution_options->debug_options()); @@ -371,7 +373,7 @@ StatusOr>> Service::BuildExecutables( // Dump computation proto state if flag is set. std::vector> hlo_protos; - for (int64 i = 0; i < module_protos.size(); ++i) { + for (int64 i = 0, end = module_protos.size(); i < end; ++i) { auto hlo_proto = absl::make_unique(); *hlo_proto->mutable_hlo_module() = *module_protos[i]; hlo_protos.push_back(std::move(hlo_proto)); @@ -385,7 +387,7 @@ StatusOr>> Service::BuildExecutables( CHECK_EQ(module_protos.size(), module_configs.size()); auto module_group = absl::make_unique(module_protos[0]->name()); - for (int64 i = 0; i < module_protos.size(); ++i) { + for (int64 i = 0, end = module_protos.size(); i < end; ++i) { const HloModuleProto* proto = module_protos[i]; const HloModuleConfig& config = *module_configs[i]; TF_ASSIGN_OR_RETURN(auto module, CreateModuleFromProto(*proto, config)); @@ -433,12 +435,12 @@ Service::ExecuteParallelAndRegisterResult( for (int64 i = 0; i < executables.size(); i++) { TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*backend, device_handles[i])); CHECK_EQ(replicas.size(), arguments[i].size()); - for (int64 replica = 0; replica < replicas.size(); ++replica) { + for (int64 replica = 0, end = replicas.size(); replica < end; ++replica) { device_assignment(replica, i) = replicas[replica]->device_ordinal(); } } - for (int64 i = 0; i < executables.size(); i++) { + for (int64 i = 0, end = executables.size(); i < end; i++) { // Stream executors for the replicas of the current computation. TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*backend, device_handles[i])); CHECK_EQ(replicas.size(), arguments[i].size()); @@ -497,7 +499,7 @@ Service::ExecuteParallelAndRegisterResult( } // Wait for all executions to complete. 
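// The loop rewrites in this file all follow the same pattern: hoist the
// container size into a signed local so the loop index (int64) is compared
// against another int64 rather than against size_t. The motivation is
// inferred, not stated in the patch; a standalone sketch with standard types:
//
//   #include <cstdint>
//   #include <vector>
//
//   void Visit(const std::vector<int>& items) {
//     for (int64_t i = 0, end = items.size(); i < end; ++i) {
//       // ... use items[i] ...
//     }
//   }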
- for (int64 i = 0; i < streams.size(); ++i) { + for (int64 i = 0, end = streams.size(); i < end; ++i) { Status block_status = streams[i]->BlockHostUntilDone(); if (!block_status.ok()) { return InternalError("failed to complete execution for stream %d: %s", i, @@ -715,7 +717,7 @@ Status Service::ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg, std::vector snapshots; snapshots.resize(executable_ptrs.size()); - for (int i = 0; i < executable_ptrs.size(); i++) { + for (int i = 0, end = executable_ptrs.size(); i < end; i++) { if (executable_ptrs[i]->dumping_snapshot()) { *snapshots[i].mutable_hlo() = *executable_ptrs[i]->hlo_proto(); TF_ASSIGN_OR_RETURN(auto stream, @@ -761,7 +763,7 @@ Status Service::ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg, *result->add_responses() = response; } - for (int i = 0; i < executable_ptrs.size(); i++) { + for (int i = 0, end = executable_ptrs.size(); i < end; i++) { Executable* executable = executable_ptrs[i]; if (executable->dumping_snapshot()) { TF_ASSIGN_OR_RETURN(const ShapedBuffer* result_buffer, diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc index ec8e4d23d21..8e39e32e4c3 100644 --- a/tensorflow/compiler/xla/service/shape_inference.cc +++ b/tensorflow/compiler/xla/service/shape_inference.cc @@ -643,11 +643,6 @@ Status ValidateDotDimensionNumbers( return InvalidArgument("%s", message); }; - // Check if both element types are the same. - if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(lhs, rhs)) { - return fail("Element types do not match."); - } - // Validate basic properties of dot dimension numbers. TF_RETURN_IF_ERROR(ValidateDotDimensionNumbers(lhs, rhs, dimension_numbers)); @@ -954,18 +949,18 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, TF_RETURN_IF_ERROR(ExpectArray( rhs, absl::StrCat("rhs of binary operation ", HloOpcodeString(opcode)))); switch (opcode) { + case HloOpcode::kAdd: case HloOpcode::kMaximum: case HloOpcode::kMinimum: + case HloOpcode::kMultiply: return InferElementwiseBinaryOpShape(opcode, lhs, rhs, broadcast_dimensions); case HloOpcode::kSubtract: - case HloOpcode::kAdd: case HloOpcode::kAtan2: case HloOpcode::kPower: case HloOpcode::kDivide: case HloOpcode::kRemainder: - case HloOpcode::kMultiply: case HloOpcode::kShiftLeft: case HloOpcode::kShiftRightArithmetic: case HloOpcode::kShiftRightLogical: @@ -1621,11 +1616,6 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, batch_group_count, feature_group_count); } - if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(lhs, rhs)) { - return InvalidArgument( - "Convolution with different element types: %s and %s.", - ShapeUtil::HumanString(lhs), ShapeUtil::HumanString(rhs)); - } if (dnums.input_spatial_dimensions_size() != dnums.kernel_spatial_dimensions_size()) { return InvalidArgument( diff --git a/tensorflow/compiler/xla/service/sharding_propagation.cc b/tensorflow/compiler/xla/service/sharding_propagation.cc index 6c4cf2d7866..4ff492047a3 100644 --- a/tensorflow/compiler/xla/service/sharding_propagation.cc +++ b/tensorflow/compiler/xla/service/sharding_propagation.cc @@ -91,9 +91,7 @@ bool IsShardingMoreSpecific(const HloSharding& lhs, const HloSharding& rhs) { return is_better; } if (!rhs.IsTileMaximal()) { - // If we already have a non-tile-maximal sharding then we can't improve - // that. 
- return false; + return lhs.NumTiles() > rhs.NumTiles(); } else if (!rhs.IsReplicated()) { // If we are not replicated then only tiled (not tile maximal) shardings // can improve us. @@ -124,9 +122,12 @@ HloSharding MergeForMoreSpecificSharding(const HloSharding& a, // Updates the sharding of the specified instruction with the specified sharding // if it is better than the current one and returns true if a new sharding have -// been applied. +// been applied. If may_combine_partial_sharding is true, this may combine the +// new and existing sharding if they are both partial tiling partial +// replication. bool MaybeImproveInstructionSharding(const HloSharding& sharding, - HloInstruction* instruction) { + HloInstruction* instruction, + bool may_combine_partial_sharding) { // We don't want to propagate tile maximal shardings. if (!IsSpatiallyPartitioned(sharding)) { return false; @@ -136,6 +137,101 @@ bool MaybeImproveInstructionSharding(const HloSharding& sharding, instruction->set_sharding(sharding); return true; } + if (may_combine_partial_sharding && sharding.ReplicateOnLastTileDim() && + instruction->sharding().ReplicateOnLastTileDim()) { + if (sharding.tile_assignment().num_elements() == + instruction->sharding().tile_assignment().num_elements()) { + // Combine the tile dimension sizes from new and old. + int64 num_devices = sharding.tile_assignment().num_elements(); + std::vector new_tile_dims; + bool compatible = true; + new_tile_dims.reserve(sharding.tile_assignment().num_dimensions()); + for (int64 i = 0; i < sharding.tile_assignment().num_dimensions() - 1; + ++i) { + int64 new_dim = sharding.tile_assignment().dim(i); + int64 old_dim = instruction->sharding().tile_assignment().dim(i); + if (new_dim == 1) { + new_tile_dims.push_back(old_dim); + } else if (old_dim == 1) { + new_tile_dims.push_back(new_dim); + } else if (new_dim == old_dim) { + new_tile_dims.push_back(new_dim); + } else { + compatible = false; + break; + } + } + int64 replication = num_devices / Product(new_tile_dims); + if (compatible && num_devices % Product(new_tile_dims) == 0 && + replication < + instruction->sharding().tile_assignment().dimensions().back()) { + new_tile_dims.push_back(replication); + Array new_tile(new_tile_dims); + // Maps from replication group ID to sorted members. + absl::flat_hash_map> old_group_members; + absl::flat_hash_map> new_group_members; + auto get_group_index = [&](absl::Span tile_indices, + const HloSharding& sharding) { + int64 group_id = 0; + for (int64 i = 0; i < tile_indices.size() - 1; ++i) { + group_id *= sharding.tile_assignment().dim(i); + group_id += tile_indices[i]; + } + return group_id; + }; + instruction->sharding().tile_assignment().Each( + [&](absl::Span indices, int64 device) { + old_group_members[get_group_index(indices, + instruction->sharding())] + .insert(device); + }); + sharding.tile_assignment().Each([&](absl::Span indices, + int64 device) { + new_group_members[get_group_index(indices, sharding)].insert(device); + }); + // Try to find the intersection of old and new replication groups, in + // order to determine the merged tile assignment. 
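// The per-dimension merge above follows a simple rule: a dimension of the new
// and old tile assignments can be combined when either side is 1 or both are
// equal; otherwise the two shardings are treated as incompatible. A
// self-contained sketch of just that rule (names are illustrative):
//
//   #include <cstdint>
//   #include <optional>
//   #include <vector>
//
//   std::optional<std::vector<int64_t>> MergeTileDims(
//       const std::vector<int64_t>& new_dims,
//       const std::vector<int64_t>& old_dims) {
//     std::vector<int64_t> merged;
//     merged.reserve(new_dims.size());
//     for (size_t i = 0; i < new_dims.size(); ++i) {
//       if (new_dims[i] == 1) {
//         merged.push_back(old_dims[i]);
//       } else if (old_dims[i] == 1 || new_dims[i] == old_dims[i]) {
//         merged.push_back(new_dims[i]);
//       } else {
//         return std::nullopt;  // incompatible tilings
//       }
//     }
//     return merged;
//   }
//
// For example, over 8 devices, merging new dims {2, 1} with old dims {1, 4}
// yields {2, 4}, i.e. two partially replicated shardings combine into a full
// 2x4 tiling, provided the replication groups agree on a common device
// assignment, which is what the group-intersection logic below verifies.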
+ new_tile.Each([&](absl::Span indices, int64* device) { + if (!compatible) { + return; + } + std::vector old_index(indices.begin(), indices.end()); + std::vector new_index = old_index; + for (int64 i = 0; i < indices.size() - 1; ++i) { + if (instruction->sharding().tile_assignment().dim(i) == 1) { + old_index[i] = 0; + } + if (sharding.tile_assignment().dim(i) == 1) { + new_index[i] = 0; + } + } + int64 old_group_id = + get_group_index(old_index, instruction->sharding()); + int64 new_group_id = get_group_index(new_index, sharding); + if (old_group_members[old_group_id].empty() || + new_group_members[new_group_id].empty() || + *old_group_members[old_group_id].begin() != + *new_group_members[new_group_id].begin()) { + compatible = false; + return; + } + *device = *old_group_members[old_group_id].begin(); + old_group_members[old_group_id].erase(*device); + new_group_members[new_group_id].erase(*device); + }); + if (compatible) { + if (replication == 1) { + new_tile_dims.pop_back(); + new_tile.Reshape(new_tile_dims); + instruction->set_sharding(HloSharding::Tile(new_tile)); + } else { + instruction->set_sharding(HloSharding::PartialTile(new_tile)); + } + return true; + } + } + } + } if (IsShardingMoreSpecific(sharding, instruction->sharding())) { instruction->set_sharding(sharding); return true; @@ -363,7 +459,8 @@ bool SupportSpatialPartitioning(const HloInstruction* instruction, // Convolution handling for InferShardingFromOperands(). bool InferConvolutionShardingFromOperands(HloInstruction* instruction, - bool aggressive_prop) { + bool aggressive_prop, + bool may_combine_partial_sharding) { const auto& dnums = instruction->convolution_dimension_numbers(); const HloInstruction* lhs = instruction->operand(0); const HloInstruction* rhs = instruction->operand(1); @@ -430,13 +527,15 @@ bool InferConvolutionShardingFromOperands(HloInstruction* instruction, partitioned_only_along_non_trivial_dims(lhs->sharding(), dot_dims->batch_dims, 0)) { return MaybeImproveInstructionSharding(get_tiled_sharding_based_on_lhs(), - instruction); + instruction, + may_combine_partial_sharding); } if (IsSpatiallyPartitioned(rhs) && partitioned_only_along_non_trivial_dims(rhs->sharding(), dot_dims->batch_dims, 1)) { return MaybeImproveInstructionSharding(get_tiled_sharding_based_on_rhs(), - instruction); + instruction, + may_combine_partial_sharding); } if (aggressive_prop) { // If LHS/RHS is partitioned only along the non-contracting @@ -455,19 +554,23 @@ bool InferConvolutionShardingFromOperands(HloInstruction* instruction, if (Product(lhs->shape().dimensions()) >= Product(rhs->shape().dimensions())) { return MaybeImproveInstructionSharding( - get_tiled_sharding_based_on_lhs(), instruction); + get_tiled_sharding_based_on_lhs(), instruction, + may_combine_partial_sharding); } else { return MaybeImproveInstructionSharding( - get_tiled_sharding_based_on_rhs(), instruction); + get_tiled_sharding_based_on_rhs(), instruction, + may_combine_partial_sharding); } } if (can_propagate_from_lhs) { return MaybeImproveInstructionSharding( - get_tiled_sharding_based_on_lhs(), instruction); + get_tiled_sharding_based_on_lhs(), instruction, + may_combine_partial_sharding); } if (can_propagate_from_rhs) { return MaybeImproveInstructionSharding( - get_tiled_sharding_based_on_rhs(), instruction); + get_tiled_sharding_based_on_rhs(), instruction, + may_combine_partial_sharding); } } } @@ -476,8 +579,8 @@ bool InferConvolutionShardingFromOperands(HloInstruction* instruction, return false; } if (lhs->sharding().IsReplicated()) { - return 
MaybeImproveInstructionSharding(HloSharding::Replicate(), - instruction); + return MaybeImproveInstructionSharding( + HloSharding::Replicate(), instruction, may_combine_partial_sharding); } if (IsConvolutionKernelSmall(instruction)) { @@ -488,11 +591,13 @@ bool InferConvolutionShardingFromOperands(HloInstruction* instruction, return false; } return MaybeImproveInstructionSharding(get_tiled_sharding_based_on_lhs(), - instruction); + instruction, + may_combine_partial_sharding); } // If the kernel is large (e.g backward convolution) then we only support // replicated output. - return MaybeImproveInstructionSharding(HloSharding::Replicate(), instruction); + return MaybeImproveInstructionSharding(HloSharding::Replicate(), instruction, + may_combine_partial_sharding); } // Tries to update the sharding of the specified instruction based on its @@ -512,8 +617,9 @@ bool InferShardingFromOperands(HloInstruction* instruction, if (absl::c_any_of(instruction->operands(), [](const HloInstruction* op) { return op->has_sharding() && op->sharding().IsReplicated(); })) { - return MaybeImproveInstructionSharding(HloSharding::Replicate(), - instruction); + return MaybeImproveInstructionSharding( + HloSharding::Replicate(), instruction, + /*may_combine_partial_sharding=*/is_spmd); } return false; } @@ -526,7 +632,8 @@ bool InferShardingFromOperands(HloInstruction* instruction, } HloSharding new_sharding = operand->sharding().GetSubSharding( operand->shape(), {instruction->tuple_index()}); - return MaybeImproveInstructionSharding(new_sharding, instruction); + return MaybeImproveInstructionSharding( + new_sharding, instruction, /*may_combine_partial_sharding=*/is_spmd); } case HloOpcode::kTuple: { if (absl::c_none_of(instruction->operands(), @@ -599,40 +706,37 @@ bool InferShardingFromOperands(HloInstruction* instruction, sharding); return HloSharding::Tuple(instruction->shape(), tuple); }; - if (operand->sharding().IsReplicated()) { + if (operand->sharding().IsReplicated() || + (!is_spmd && + absl::c_any_of(instruction->dimensions(), [operand](int64 dim) { + return operand->sharding().tile_assignment().dim(dim) > 1; + }))) { + // We are reducing along one of the sharded dimensions. We only + // support this in SPMD. changed |= MaybeImproveInstructionSharding( - get_maybe_tuple_sharding(HloSharding::Replicate()), instruction); + get_maybe_tuple_sharding(HloSharding::Replicate()), instruction, + /*may_combine_partial_sharding=*/is_spmd); continue; } - if (absl::c_any_of(instruction->dimensions(), [operand](int64 dim) { - return operand->sharding().tile_assignment().dim(dim) > 1; - })) { - // We are reducing along one of the sharded dimensions. We don't - // support tiled sharding in this case. + auto after_partial_replication = + operand->sharding().IsReplicated() + ? operand->sharding() + : hlo_sharding_util::PartiallyReplicateTiledShardingOnDims( + operand->sharding(), instruction->dimensions()); + if (after_partial_replication.IsReplicated()) { changed |= MaybeImproveInstructionSharding( - get_maybe_tuple_sharding(HloSharding::Replicate()), instruction); - } else { - // We are reducing along some of the non-sharded dimensions. The - // result sharding should be the same as the operand sharding with the - // reduction dimensions removed as they are removed from the result - // shape. 
- std::vector target_tile_assignment_dimensions; - const auto& dimensions = instruction->dimensions(); - for (int64 i = 0; i < operand->shape().rank(); ++i) { - if (absl::c_find(dimensions, i) == dimensions.end()) { - target_tile_assignment_dimensions.push_back( - operand->sharding().tile_assignment().dim(i)); - } - } - Array new_tile_assignment = - operand->sharding().tile_assignment(); - new_tile_assignment.Reshape(target_tile_assignment_dimensions); - // Use the same sharding for all tuple elements, because they are part - // of the same reduce instruction. - HloSharding new_sharding = - get_maybe_tuple_sharding(HloSharding::Tile(new_tile_assignment)); - changed |= MaybeImproveInstructionSharding(new_sharding, instruction); + get_maybe_tuple_sharding(HloSharding::Replicate()), instruction, + /*may_combine_partial_sharding=*/is_spmd); + continue; } + // Use the same sharding for all tuple elements, because they are part + // of the same reduce instruction. + HloSharding new_sharding = + get_maybe_tuple_sharding(hlo_sharding_util::RemoveShapeDimensions( + after_partial_replication, instruction->dimensions())); + changed |= MaybeImproveInstructionSharding( + new_sharding, instruction, + /*may_combine_partial_sharding=*/is_spmd); } return changed; } @@ -662,13 +766,23 @@ bool InferShardingFromOperands(HloInstruction* instruction, op->sharding().tile_assignment().dim(source_dim)); } } + if (op->sharding().ReplicateOnLastTileDim()) { + target_tile_assignment_dimensions.push_back( + op->sharding().tile_assignment().dimensions().back()); + } Array new_tile_assignment = op->sharding().tile_assignment(); new_tile_assignment.Reshape(target_tile_assignment_dimensions); - HloSharding new_sharding = HloSharding::Tile(new_tile_assignment); - return MaybeImproveInstructionSharding(new_sharding, instruction); + HloSharding new_sharding = + op->sharding().ReplicateOnLastTileDim() + ? HloSharding::PartialTile(new_tile_assignment) + : HloSharding::Tile(new_tile_assignment); + return MaybeImproveInstructionSharding( + new_sharding, instruction, /*may_combine_partial_sharding=*/is_spmd); } case HloOpcode::kConvolution: - return InferConvolutionShardingFromOperands(instruction, aggressive_prop); + return InferConvolutionShardingFromOperands( + instruction, aggressive_prop, + /*may_combine_partial_sharding=*/is_spmd); case HloOpcode::kTranspose: { const HloInstruction* input = instruction->operand(0); if (!IsSpatiallyPartitioned(input)) { @@ -676,7 +790,8 @@ bool InferShardingFromOperands(HloInstruction* instruction, } HloSharding sharding = hlo_sharding_util::TransposeSharding( input->sharding(), instruction->dimensions()); - return MaybeImproveInstructionSharding(sharding, instruction); + return MaybeImproveInstructionSharding( + sharding, instruction, /*may_combine_partial_sharding=*/is_spmd); } case HloOpcode::kReduceWindow: { const HloInstruction* lhs = instruction->operand(0); @@ -694,7 +809,9 @@ bool InferShardingFromOperands(HloInstruction* instruction, << instruction->ToString(); return false; } - return MaybeImproveInstructionSharding(lhs->sharding(), instruction); + return MaybeImproveInstructionSharding( + lhs->sharding(), instruction, + /*may_combine_partial_sharding=*/is_spmd); } case HloOpcode::kSelectAndScatter: { // Shard according to first operand, as output keeps the same shape. 
@@ -713,7 +830,9 @@ bool InferShardingFromOperands(HloInstruction* instruction, << instruction->ToString(); return false; } - return MaybeImproveInstructionSharding(lhs->sharding(), instruction); + return MaybeImproveInstructionSharding( + lhs->sharding(), instruction, + /*may_combine_partial_sharding=*/is_spmd); } case HloOpcode::kReshape: { if (!IsSpatiallyPartitioned(instruction->operand(0))) { @@ -724,8 +843,9 @@ bool InferShardingFromOperands(HloInstruction* instruction, instruction->operand(0)->shape(), instruction->shape(), instruction->operand(0)->sharding()); if (new_sharding.has_value()) { - return MaybeImproveInstructionSharding(new_sharding.value(), - instruction); + return MaybeImproveInstructionSharding( + new_sharding.value(), instruction, + /*may_combine_partial_sharding=*/is_spmd); } return false; } @@ -736,7 +856,7 @@ bool InferShardingFromOperands(HloInstruction* instruction, return MaybeImproveInstructionSharding( hlo_sharding_util::ReverseSharding( instruction->operand(0)->sharding(), instruction->dimensions()), - instruction); + instruction, /*may_combine_partial_sharding=*/is_spmd); } case HloOpcode::kDot: { auto& dot_dim_numbs = instruction->dot_dimension_numbers(); @@ -765,8 +885,9 @@ bool InferShardingFromOperands(HloInstruction* instruction, } else if (ops_sharding[0]->IsReplicated() && ops_sharding[1]->IsReplicated()) { // Both replicated -> replicate - return MaybeImproveInstructionSharding(HloSharding::Replicate(), - instruction); + return MaybeImproveInstructionSharding( + HloSharding::Replicate(), instruction, + /*may_combine_partial_sharding=*/is_spmd); } else if (!ops_sharding[0]->IsReplicated() && !ops_sharding[1]->IsReplicated()) { // Both tile sharded. The dot spatial partitioning implementation @@ -785,8 +906,9 @@ bool InferShardingFromOperands(HloInstruction* instruction, } if (ops_sharding[representative_op]->IsReplicated()) { - return MaybeImproveInstructionSharding(HloSharding::Replicate(), - instruction); + return MaybeImproveInstructionSharding( + HloSharding::Replicate(), instruction, + /*may_combine_partial_sharding=*/is_spmd); } else { // Tile-shard instruction according to representative op. 
auto sharding = *ops_sharding[representative_op]; @@ -811,7 +933,8 @@ bool InferShardingFromOperands(HloInstruction* instruction, tile_assignment.Reshape(dimensions); sharding = HloSharding::Tile(tile_assignment); } - return MaybeImproveInstructionSharding(sharding, instruction); + return MaybeImproveInstructionSharding( + sharding, instruction, /*may_combine_partial_sharding=*/is_spmd); } } case HloOpcode::kParameter: { @@ -826,7 +949,8 @@ bool InferShardingFromOperands(HloInstruction* instruction, if (parent->called_computations()[i - 1] == instruction->parent()) { if (parent->operand(i)->has_sharding()) { return MaybeImproveInstructionSharding( - parent->operand(i)->sharding(), instruction); + parent->operand(i)->sharding(), instruction, + /*may_combine_partial_sharding=*/is_spmd); } return false; } @@ -853,15 +977,16 @@ bool InferShardingFromOperands(HloInstruction* instruction, if (instruction->shape().IsTuple()) { return MaybeImproveInstructionSharding( HloSharding::SingleTuple(instruction->shape(), operand->sharding()), - instruction); + instruction, /*may_combine_partial_sharding=*/is_spmd); } else { - return MaybeImproveInstructionSharding(operand->sharding(), - instruction); + return MaybeImproveInstructionSharding( + operand->sharding(), instruction, + /*may_combine_partial_sharding=*/is_spmd); } } case HloOpcode::kDynamicSlice: case HloOpcode::kDynamicUpdateSlice: { - auto propagate_slicing = [instruction]() { + auto propagate_slicing = [instruction, is_spmd]() { const HloInstruction* operand = instruction->opcode() == HloOpcode::kDynamicSlice ? instruction->operand(0) @@ -871,8 +996,9 @@ bool InferShardingFromOperands(HloInstruction* instruction, } if (operand->sharding().IsReplicated()) { - return MaybeImproveInstructionSharding(HloSharding::Replicate(), - instruction); + return MaybeImproveInstructionSharding( + HloSharding::Replicate(), instruction, + /*may_combine_partial_sharding=*/is_spmd); } const auto& tile_assignment = operand->sharding().tile_assignment(); @@ -883,10 +1009,11 @@ bool InferShardingFromOperands(HloInstruction* instruction, return false; } } - return MaybeImproveInstructionSharding(operand->sharding(), - instruction); + return MaybeImproveInstructionSharding( + operand->sharding(), instruction, + /*may_combine_partial_sharding=*/is_spmd); }; - auto propagate_base = [instruction]() { + auto propagate_base = [instruction, is_spmd]() { if (instruction->opcode() != HloOpcode::kDynamicUpdateSlice) { return false; } @@ -894,25 +1021,57 @@ bool InferShardingFromOperands(HloInstruction* instruction, return false; } return MaybeImproveInstructionSharding( - instruction->operand(0)->sharding(), instruction); + instruction->operand(0)->sharding(), instruction, + /*may_combine_partial_sharding=*/is_spmd); }; return propagate_slicing() || propagate_base(); } case HloOpcode::kGather: { - if (!IsSpatiallyPartitioned(instruction->operand(1))) { - return false; + bool changed = false; + if (IsSpatiallyPartitioned(instruction->operand(1))) { + HloSharding new_sharding = hlo_sharding_util::GatherOutputSharding( + instruction->operand(1)->sharding(), instruction); + changed |= MaybeImproveInstructionSharding( + new_sharding, instruction, + /*may_combine_partial_sharding=*/is_spmd); } - HloSharding new_sharding = hlo_sharding_util::GatherOutputSharding( - instruction->operand(1)->sharding(), instruction); - return MaybeImproveInstructionSharding(new_sharding, instruction); + if (is_spmd && IsSpatiallyPartitioned(instruction->operand(0))) { + auto maybe_from_data = + 
hlo_sharding_util::GatherOutputShardingFromDataOperand( + instruction->operand(0)->sharding(), *instruction); + if (maybe_from_data) { + changed |= MaybeImproveInstructionSharding( + *maybe_from_data, instruction, + /*may_combine_partial_sharding=*/is_spmd); + } + } + return changed; } case HloOpcode::kScatter: { + bool changed = false; + if (is_spmd && IsSpatiallyPartitioned(instruction->operand(0))) { + changed |= MaybeImproveInstructionSharding( + instruction->operand(0)->sharding(), instruction, + /*may_combine_partial_sharding=*/is_spmd); + } if (!IsSpatiallyPartitioned(instruction->operand(1)) && !IsSpatiallyPartitioned(instruction->operand(2))) { return false; } - return MaybeImproveInstructionSharding(HloSharding::Replicate(), - instruction); + if (is_spmd && IsSpatiallyPartitioned(instruction->operand(2))) { + auto maybe_from_update = + hlo_sharding_util::ScatterOutputShardingFromUpdate( + instruction->operand(2)->sharding(), *instruction); + if (maybe_from_update) { + changed |= MaybeImproveInstructionSharding( + *maybe_from_update, instruction, + /*may_combine_partial_sharding=*/is_spmd); + } + } + changed |= MaybeImproveInstructionSharding( + HloSharding::Replicate(), instruction, + /*may_combine_partial_sharding=*/is_spmd); + return changed; } case HloOpcode::kWhile: { if (!instruction->operand(0)->has_sharding()) { @@ -923,14 +1082,28 @@ bool InferShardingFromOperands(HloInstruction* instruction, sharding = MergeForMoreSpecificSharding(sharding, instruction->sharding()); } - return MaybeImproveInstructionSharding(sharding, instruction); + return MaybeImproveInstructionSharding( + sharding, instruction, /*may_combine_partial_sharding=*/is_spmd); } default: { + if (instruction->IsElementwise() && is_spmd) { + bool changed = false; + for (auto operand : instruction->operands()) { + if (IsSpatiallyPartitioned(operand)) { + changed |= MaybeImproveInstructionSharding( + operand->sharding(), instruction, + /*may_combine_partial_sharding=*/is_spmd); + } + } + return changed; + } const HloInstruction* operand = PickRepresentativeOperand(instruction); if (!operand || !IsSpatiallyPartitioned(operand)) { return false; } - return MaybeImproveInstructionSharding(operand->sharding(), instruction); + return MaybeImproveInstructionSharding( + operand->sharding(), instruction, + /*may_combine_partial_sharding=*/is_spmd); } } return false; @@ -948,25 +1121,25 @@ absl::optional GetShardingFromUser( if (user.sharding().IsReplicated()) { return user.sharding(); } - // Only support when none of the partitioned dimensions in the broadcast - // output belong to new dimensions. + std::vector dims_to_replicate; + bool needs_replication = false; for (int64 i = 0; i < user.shape().rank(); ++i) { - if (user.sharding().tile_assignment().dim(i) > 1 && - absl::c_count(user.dimensions(), i) == 0) { - return absl::nullopt; + if (absl::c_count(user.dimensions(), i) == 0) { + dims_to_replicate.push_back(i); + if (user.sharding().tile_assignment().dim(i) > 1) { + needs_replication = true; + } } } - - // The instruction (operand of broadcast) will be tiled the same way - // as the output. - std::vector target_tile_assignment_dimensions; - for (int64 output_dim : user.dimensions()) { - target_tile_assignment_dimensions.push_back( - user.sharding().tile_assignment().dim(output_dim)); + // If not SPMD, only support when none of the partitioned dimensions in + // the broadcast output belong to new dimensions. 
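+      // For example (the SPMD path, mirroring the BroadcastUserPartial test
+      // in sharding_propagation_test.cc): a broadcast to f32[4,24,6,8] with
+      // dimensions={1,3} and output sharding
+      // {devices=[4,2,1,1]0,1,2,3,4,5,6,7} first turns the new dims {0,2}
+      // into a replication group and then removes them, so the f32[24,8]
+      // operand gets
+      // {devices=[2,1,4]0,2,4,6,1,3,5,7 last_tile_dim_replicate}.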
+ if (!is_spmd && needs_replication) { + return absl::nullopt; } - Array new_tile_assignment = user.sharding().tile_assignment(); - new_tile_assignment.Reshape(target_tile_assignment_dimensions); - return HloSharding::Tile(new_tile_assignment); + return hlo_sharding_util::RemoveShapeDimensions( + hlo_sharding_util::PartiallyReplicateTiledShardingOnDims( + user.sharding(), dims_to_replicate), + dims_to_replicate); } case HloOpcode::kConcatenate: { if (user.sharding().IsReplicated()) { @@ -1191,10 +1364,11 @@ absl::optional GetShardingFromUser( return user_sharding; } std::vector target_tile_assignment_dimensions( - instruction.shape().rank()); + instruction.shape().rank() + + (user_sharding.ReplicateOnLastTileDim() ? 1 : 0)); const auto& dimensions = user.dimensions(); int64 next_output_dim = 0; - for (int64 i = 0; i < instruction.shape().rank(); ++i) { + for (int64 i = 0; i < target_tile_assignment_dimensions.size(); ++i) { if (absl::c_find(dimensions, i) == dimensions.end()) { target_tile_assignment_dimensions[i] = user_sharding.tile_assignment().dim(next_output_dim++); @@ -1204,7 +1378,9 @@ absl::optional GetShardingFromUser( } auto tile_assignment = user_sharding.tile_assignment(); tile_assignment.Reshape(target_tile_assignment_dimensions); - return HloSharding::Tile(tile_assignment); + return user_sharding.ReplicateOnLastTileDim() + ? HloSharding::PartialTile(tile_assignment) + : HloSharding::Tile(tile_assignment); } case HloOpcode::kSort: { if (user.sharding().IsTuple()) { @@ -1218,6 +1394,43 @@ absl::optional GetShardingFromUser( return hlo_sharding_util::ReverseSharding(user.sharding(), user.dimensions()); } + case HloOpcode::kGather: { + if (&instruction == user.operand(1)) { + return hlo_sharding_util::GatherIndexSharding(user.sharding(), &user); + } + if (is_spmd) { + return hlo_sharding_util::GatherDataOperandShardingFromOutput( + user.sharding(), user); + } + return absl::nullopt; + } + case HloOpcode::kScatter: { + if (&instruction == user.operand(0)) { + return user.sharding(); + } + if (&instruction == user.operand(1)) { + auto update = user.operand(2); + if (!IsSpatiallyPartitioned(update)) { + return absl::nullopt; + } + return hlo_sharding_util::ScatterIndexSharding(update->sharding(), + &user); + } + CHECK_EQ(&instruction, user.operand(2)); + auto indices = user.operand(1); + if (IsSpatiallyPartitioned(indices)) { + auto from_indices = + hlo_sharding_util::ScatterDataSharding(indices->sharding(), &user); + if (!from_indices.IsTileMaximal()) { + return from_indices; + } + } + if (is_spmd) { + return hlo_sharding_util::ScatterUpdateShardingFromOutput( + user.sharding(), user); + } + return absl::nullopt; + } default: { // If the user output shape is compatible with the current instruction // shape excluding element type and the current instruction is supported @@ -1246,8 +1459,9 @@ bool InferShardingFromUsers(HloInstruction* instruction, absl::optional user_sharding = GetShardingFromUser(*instruction, *user, aggressive_prop, is_spmd); if (user_sharding) { - improved_sharding |= - MaybeImproveInstructionSharding(*user_sharding, instruction); + improved_sharding |= MaybeImproveInstructionSharding( + *user_sharding, instruction, + /*may_combine_partial_sharding=*/is_spmd); } } return improved_sharding; diff --git a/tensorflow/compiler/xla/service/sharding_propagation_test.cc b/tensorflow/compiler/xla/service/sharding_propagation_test.cc index d62328aa9ad..a182af001c2 100644 --- a/tensorflow/compiler/xla/service/sharding_propagation_test.cc +++ 
b/tensorflow/compiler/xla/service/sharding_propagation_test.cc @@ -118,6 +118,25 @@ ENTRY %broadcast { op::Sharding("{devices=[1,2,2,1]0,1,2,3}")); } +TEST_F(ShardingPropagationTest, BroadcastForwardPartial) { + const char* const hlo_string = R"( +HloModule module +ENTRY %broadcast { + %param0 = f32[3,2048]parameter(0), + sharding={devices=[1,2,2]0,1,2,3 last_tile_dim_replicate} + %broadcast = f32[3,2048,3] broadcast(%param0), dimensions={0,1} + ROOT %copy = f32[3,2048,3] copy(%broadcast) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT( + FindInstruction(module.get(), "broadcast"), + op::Sharding("{devices=[1,2,1,2]0,1,2,3 last_tile_dim_replicate}")); +} + TEST_F(ShardingPropagationTest, BroadcastUser) { const char* const hlo_string = R"( HloModule module @@ -136,6 +155,25 @@ ENTRY %broadcast { op::Sharding("{devices=[2,4]0,1,2,3,4,5,6,7}")); } +TEST_F(ShardingPropagationTest, BroadcastUserPartial) { + const char* const hlo_string = R"( +HloModule module +ENTRY %broadcast { + %param0 = f32[24,8]{0,1} parameter(0) + %copy = f32[24,8]{0,1} copy(%param0) + ROOT %broadcast = f32[4,24,6,8] broadcast(%copy), dimensions={1,3}, + sharding={devices=[4,2,1,1]0,1,2,3,4,5,6,7} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT( + FindInstruction(module.get(), "copy"), + op::Sharding("{devices=[2,1,4]0,2,4,6,1,3,5,7 last_tile_dim_replicate}")); +} + TEST_F(ShardingPropagationTest, MaximalReduceForwardPass) { const char* const hlo_string = R"( HloModule module @@ -184,6 +222,78 @@ ENTRY %reduce { op::Sharding("{devices=[2,2]0,1,2,3}")); } +TEST_F(ShardingPropagationTest, ReducePartiallyOnTiledDims) { + const char* const hlo_string = R"( +HloModule module +%add { + %lhs = f32[] parameter(0) + %rhs = f32[] parameter(1) + ROOT %add = f32[] add(%lhs, %rhs) +} +ENTRY %reduce { + %param0 = f32[8,8] parameter(0), sharding={devices=[2,2]0,1,2,3} + %init = f32[] parameter(1) + %reduce = f32[8] reduce(%param0, %init), dimensions={0}, to_apply=%add + ROOT %copy = f32[8] copy(%reduce) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "reduce"), + op::Sharding("{devices=[2,2]0,2,1,3 last_tile_dim_replicate}")); +} + +TEST_F(ShardingPropagationTest, ReducePartiallyOnTiledDims2) { + const char* const hlo_string = R"( +HloModule module +%add { + %lhs = f32[] parameter(0) + %rhs = f32[] parameter(1) + ROOT %add = f32[] add(%lhs, %rhs) +} +ENTRY %reduce { + %param0 = f32[8,8] parameter(0), sharding={devices=[2,2,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + %init = f32[] parameter(1) + %reduce = f32[8] reduce(%param0, %init), dimensions={0}, to_apply=%add + ROOT %copy = f32[8] copy(%reduce) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT( + FindInstruction(module.get(), "reduce"), + op::Sharding("{devices=[2,4]0,1,4,5,2,3,6,7 last_tile_dim_replicate}")); +} + 
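+// A rough walk-through of the forward reduce rule for the
+// ReducePartiallyOnTiledDims test above (assuming the hlo_sharding_util
+// helpers behave as the pass uses them):
+//   operand f32[8,8]: tile_assignment [2,2] = 0,1,2,3
+//   PartiallyReplicateTiledShardingOnDims(sharding, {0})
+//     -> tile_assignment [1,2,2] = 0,2,1,3: the reduced dim 0 is folded into
+//        a replication group, so column shard 0 is replicated on {0,2} and
+//        column shard 1 on {1,3}
+//   RemoveShapeDimensions(partial, {0})
+//     -> tile_assignment [2,2] = 0,2,1,3 with last_tile_dim_replicate, which
+//        is exactly the sharding the test expects on %reduce.
+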
+TEST_F(ShardingPropagationTest, ReducePartiallyBackward) { + const char* const hlo_string = R"( +HloModule module +%add { + %lhs = f32[] parameter(0) + %rhs = f32[] parameter(1) + ROOT %add = f32[] add(%lhs, %rhs) +} +ENTRY %reduce { + %param0 = f32[8,8] parameter(0) + %input = f32[8,8] copy(%param0) + %init = f32[] parameter(1) + %reduce = f32[8] reduce(%input, %init), dimensions={0}, to_apply=%add, + sharding={devices=[2,2]0,1,2,3 last_tile_dim_replicate} + ROOT %copy = f32[8] copy(%reduce) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "input"), + op::Sharding("{devices=[1,2,2]0,1,2,3 last_tile_dim_replicate}")); +} + TEST_F(ShardingPropagationTest, ShardedTupleReduceForwardAndBackwardPass) { const char* const hlo_string = R"( HloModule module @@ -1149,21 +1259,21 @@ ENTRY entry { ShardingPropagation().Run(module.get())); EXPECT_TRUE(changed); EXPECT_THAT(FindInstruction(module.get(), "tp"), - op::Sharding("{{devices=[1,2]0,1}}")); + op::Sharding("{{devices=[3,1]0,1,2}}")); EXPECT_THAT(FindInstruction(module.get(), "tgte"), - op::Sharding("{devices=[1,2]0,1}")); + op::Sharding("{devices=[3,1]0,1,2}")); EXPECT_THAT(FindInstruction(module.get(), "ttr"), - op::Sharding("{devices=[2,1]0,1}")); + op::Sharding("{devices=[1,3]0,1,2}")); EXPECT_THAT(FindInstruction(module.get(), "tr"), - op::Sharding("{{devices=[2,1]0,1}}")); + op::Sharding("{{devices=[1,3]0,1,2}}")); EXPECT_THAT(FindInstruction(module.get(), "fp"), op::Sharding("{{devices=[1,3]0,1,2}}")); EXPECT_THAT(FindInstruction(module.get(), "fgte"), op::Sharding("{devices=[1,3]0,1,2}")); EXPECT_THAT(FindInstruction(module.get(), "fr"), - op::Sharding("{{devices=[2,1]0,1}}")); + op::Sharding("{{devices=[1,3]0,1,2}}")); EXPECT_THAT(FindInstruction(module.get(), "conditional"), - op::Sharding("{{devices=[2,1]0,1}}")); + op::Sharding("{{devices=[1,3]0,1,2}}")); } TEST_F(ShardingPropagationTest, TupleFromUser) { @@ -1494,5 +1604,328 @@ ENTRY entry { op::Sharding("{devices=[2,1,1,1]0,1}")); } +TEST_F(ShardingPropagationTest, GatherFromIndex) { + const char* hlo_string = R"( +HloModule module + +ENTRY entry { + %input = f32[2,9] parameter(0), sharding={replicated} + %indices = s32[3] parameter(1), sharding={devices=[2]0,1} + %gather = f32[3,9] gather(%input, %indices), offset_dims={1}, + collapsed_slice_dims={0}, start_index_map={0}, index_vector_dim=1, + slice_sizes={1,9} + ROOT %copy = f32[3,9] copy(%gather) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "gather"), + op::Sharding("{devices=[2,1]0,1}")); +} + +TEST_F(ShardingPropagationTest, GatherFromDataOperand) { + const char* hlo_string = R"( +HloModule module + +ENTRY entry { + %input = f32[2,9] parameter(0), sharding={devices=[1,2]0,1} + %indices = s32[3] parameter(1), sharding={replicated} + %gather = f32[3,9] gather(%input, %indices), offset_dims={1}, + collapsed_slice_dims={0}, start_index_map={0}, index_vector_dim=1, + slice_sizes={1,9} + ROOT %copy = f32[3,9] copy(%gather) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + 
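+  // The data operand is sharded on its non-collapsed dimension (dim 1), which
+  // maps to the gather output's offset dimension, hence the expectation below.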
EXPECT_THAT(FindInstruction(module.get(), "gather"), + op::Sharding("{devices=[1,2]0,1}")); +} + +TEST_F(ShardingPropagationTest, GatherToIndex) { + const char* hlo_string = R"( +HloModule module + +ENTRY entry { + %input = f32[2,9] parameter(0), sharding={replicated} + %p1 = s32[3] parameter(1) + %indices = s32[3] copy(%p1) + ROOT %gather = f32[3,9] gather(%input, %indices), offset_dims={1}, + collapsed_slice_dims={0}, start_index_map={0}, index_vector_dim=1, + slice_sizes={1,9}, sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "indices"), + op::Sharding("{devices=[2]0,1}")); +} + +TEST_F(ShardingPropagationTest, GatherToDataOperand) { + const char* hlo_string = R"( +HloModule module + +ENTRY entry { + %p0 = f32[2,9] parameter(0) + %input = f32[2,9] copy(%p0) + %indices = s32[3] parameter(1), sharding={replicated} + ROOT %gather = f32[3,9] gather(%input, %indices), offset_dims={1}, + collapsed_slice_dims={0}, start_index_map={0}, index_vector_dim=1, + slice_sizes={1,9}, sharding={devices=[1,2]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "input"), + op::Sharding("{devices=[1,2]0,1}")); +} + +TEST_F(ShardingPropagationTest, DataOperandToScatter) { + const char* const hlo_string = R"( +HloModule module + +add (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT sum = f32[] add(lhs, rhs) +} + +ENTRY entry { + %input = f32[2,9] parameter(0), sharding={devices=[1,2]0,1} + %indices = s32[3] parameter(1), sharding={replicated} + %updates = f32[3,9] parameter(2), sharding={replicated} + %scatter = f32[2,9] scatter(%input, %indices, %updates), + to_apply=add, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1 + ROOT %copy = f32[2,9] copy(%scatter) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "scatter"), + op::Sharding("{devices=[1,2]0,1}")); +} + +TEST_F(ShardingPropagationTest, UpdateOperandToScatter) { + const char* const hlo_string = R"( +HloModule module + +add (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT sum = f32[] add(lhs, rhs) +} + +ENTRY entry { + %input = f32[2,9] parameter(0), sharding={replicated} + %indices = s32[3] parameter(1), sharding={replicated} + %updates = f32[3,9] parameter(2), sharding={devices=[1,2]0,1} + %scatter = f32[2,9] scatter(%input, %indices, %updates), + to_apply=add, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1 + ROOT %copy = f32[2,9] copy(%scatter) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "scatter"), + op::Sharding("{devices=[1,2]0,1}")); +} + +TEST_F(ShardingPropagationTest, 
ScatterToDataOperand) { + const char* const hlo_string = R"( +HloModule module + +add (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT sum = f32[] add(lhs, rhs) +} + +ENTRY entry { + %p0 = f32[2,9] parameter(0) + %input = f32[2,9] copy(%p0) + %indices = s32[3] parameter(1), sharding={replicated} + %updates = f32[3,9] parameter(2), sharding={replicated} + ROOT %scatter = f32[2,9] scatter(%input, %indices, %updates), + to_apply=add, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1, sharding={devices=[1,2]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "input"), + op::Sharding("{devices=[1,2]0,1}")); +} + +TEST_F(ShardingPropagationTest, ScatterToUpdateOperand) { + const char* const hlo_string = R"( +HloModule module + +add (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT sum = f32[] add(lhs, rhs) +} + +ENTRY entry { + %input = f32[2,9] parameter(0) + %indices = s32[3] parameter(1), sharding={replicated} + %p2 = f32[3,9] parameter(2) + %updates = f32[3,9] copy(%p2) + ROOT %scatter = f32[2,9] scatter(%input, %indices, %updates), + to_apply=add, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1, sharding={devices=[1,2]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "updates"), + op::Sharding("{devices=[1,2]0,1}")); +} + +TEST_F(ShardingPropagationTest, ScatterUpdateToIndex) { + const char* const hlo_string = R"( +HloModule module + +add (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT sum = f32[] add(lhs, rhs) +} + +ENTRY entry { + %input = f32[2,9] parameter(0), sharding={replicated} + %p1 = s32[3] parameter(1), sharding={replicated} + %indices = s32[3] copy(%p1) + %updates = f32[3,9] parameter(2), sharding={devices=[2,1]0,1} + ROOT %scatter = f32[2,9] scatter(%input, %indices, %updates), + to_apply=add, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1, sharding={replicated} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "indices"), + op::Sharding("{devices=[2]0,1}")); +} + +TEST_F(ShardingPropagationTest, ScatterIndexToUpdate) { + const char* const hlo_string = R"( +HloModule module + +add (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT sum = f32[] add(lhs, rhs) +} + +ENTRY entry { + %input = f32[2,9] parameter(0), sharding={replicated} + %indices = s32[3] parameter(1), sharding={devices=[2]0,1} + %p2 = f32[3,9] parameter(2), sharding={replicated} + %updates = f32[3,9] copy(%p2) + ROOT %scatter = f32[2,9] scatter(%input, %indices, %updates), + to_apply=add, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1, sharding={replicated} +})"; + 
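+  // Backward case: the scatter's output sharding applies directly to its data
+  // operand (%input here), since the scatter output and the data operand have
+  // the same shape.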
TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "updates"), + op::Sharding("{devices=[2,1]0,1}")); +} + +TEST_F(ShardingPropagationTest, PartialShardingOnElementwise) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %p0 = f32[2,9] parameter(0), sharding={devices=[1,2,2]0,1,2,3 last_tile_dim_replicate} + %p1 = f32[2,9] parameter(1), sharding={devices=[2,1,2]0,2,1,3 last_tile_dim_replicate} + %lhs = f32[2,9] copy(%p0) + %rhs = f32[2,9] copy(%p1) + %add = f32[2,9] add(%lhs, %rhs) + ROOT %copy = f32[2,9] copy(%add) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "lhs"), + op::Sharding("{devices=[2,2]0,2,1,3}")); + EXPECT_THAT(FindInstruction(module.get(), "rhs"), + op::Sharding("{devices=[2,2]0,2,1,3}")); + EXPECT_THAT(FindInstruction(module.get(), "add"), + op::Sharding("{devices=[2,2]0,2,1,3}")); +} + +TEST_F(ShardingPropagationTest, PartialShardingOnElementwise2) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %p0 = f32[2,9] parameter(0), sharding={devices=[1,2,4]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + %p1 = f32[2,9] parameter(1), sharding={devices=[2,1,4]0,1,4,5,2,3,6,7 last_tile_dim_replicate} + %lhs = f32[2,9] copy(%p0) + %rhs = f32[2,9] copy(%p1) + %add = f32[2,9] add(%lhs, %rhs) + ROOT %copy = f32[2,9] copy(%add) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT( + FindInstruction(module.get(), "lhs"), + op::Sharding("{devices=[2,2,2]0,1,4,5,2,3,6,7 last_tile_dim_replicate}")); + EXPECT_THAT( + FindInstruction(module.get(), "rhs"), + op::Sharding("{devices=[2,2,2]0,1,4,5,2,3,6,7 last_tile_dim_replicate}")); + EXPECT_THAT( + FindInstruction(module.get(), "add"), + op::Sharding("{devices=[2,2,2]0,1,4,5,2,3,6,7 last_tile_dim_replicate}")); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/spmd/BUILD b/tensorflow/compiler/xla/service/spmd/BUILD index 4433078472d..ce19934bb88 100644 --- a/tensorflow/compiler/xla/service/spmd/BUILD +++ b/tensorflow/compiler/xla/service/spmd/BUILD @@ -17,6 +17,8 @@ package_group( cc_library( name = "spmd_partitioner", srcs = [ + "convolution_handler.cc", + "dot_handler.cc", "spmd_partitioner.cc", "spmd_partitioner_util.cc", ], @@ -48,6 +50,8 @@ cc_library( "//tensorflow/compiler/xla/service:tuple_simplifier", "//tensorflow/core/platform:numbers", "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:optional", diff --git a/tensorflow/compiler/xla/service/spmd/convolution_handler.cc b/tensorflow/compiler/xla/service/spmd/convolution_handler.cc new file mode 100644 index 00000000000..01d7ea2ff14 --- /dev/null +++ b/tensorflow/compiler/xla/service/spmd/convolution_handler.cc @@ -0,0 +1,1013 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "absl/algorithm/container.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/dot_as_convolution_util.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_instructions.h"
+#include "tensorflow/compiler/xla/service/hlo_sharding.h"
+#include "tensorflow/compiler/xla/service/hlo_sharding_util.h"
+#include "tensorflow/compiler/xla/service/shape_inference.h"
+#include "tensorflow/compiler/xla/service/spmd/spmd_partitioner.h"
+#include "tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/window_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/numbers.h"
+
+namespace xla {
+namespace spmd {
+namespace {
+
+// Partition convolution.
+StatusOr<HloInstruction*> PartitionConvolution(
+    PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape,
+    const HloSharding& output_sharding, const Window& conv_window,
+    HloInstruction* original_hlo, int64 num_partitions,
+    const SpmdPartitionerOptions& options, HloInstruction* partition_id,
+    HloModule* module, SpmdBuilder* b);
+
+// Partition convolution when only parallel dims are tiled.
+StatusOr<HloInstruction*> PartitionConvolutionWithParallelDimension(
+    PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape,
+    const HloSharding& output_sharding, const Window& conv_window,
+    HloInstruction* original_hlo, int64 num_partitions, SpmdBuilder* b) {
+  TF_RET_CHECK(original_hlo->opcode() == HloOpcode::kConvolution);
+
+  const auto& dnums = original_hlo->convolution_dimension_numbers();
+  std::vector<int64> rhs_to_lhs_indices(output_base_shape.rank());
+  rhs_to_lhs_indices[dnums.kernel_output_feature_dimension()] =
+      dnums.input_batch_dimension();
+  rhs_to_lhs_indices[dnums.kernel_input_feature_dimension()] =
+      dnums.input_feature_dimension();
+  for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) {
+    rhs_to_lhs_indices[dnums.kernel_spatial_dimensions(i)] =
+        dnums.input_spatial_dimensions(i);
+  }
+  std::vector<int64> lhs_to_rhs_indices(output_base_shape.rank());
+  for (int64 i = 0; i < rhs_to_lhs_indices.size(); ++i) {
+    lhs_to_rhs_indices[rhs_to_lhs_indices[i]] = i;
+  }
+  auto aligned_rhs_sharding =
+      hlo_sharding_util::TransposeSharding(lhs.sharding(), rhs_to_lhs_indices);
+  auto aligned_lhs_sharding =
+      hlo_sharding_util::TransposeSharding(rhs.sharding(), lhs_to_rhs_indices);
+
+  // Handling cases where all the partitioned dimensions are parallel
+  // dimensions.
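+  // Roughly, a spatial dimension counts as "parallel" when its window makes
+  // the convolution behave like a batch dimension there (the same
+  // size/stride/base_dilation pattern the per-shard window is rewritten to
+  // further below). The counters that follow check whether the shard counts
+  // over such dimensions multiply up to num_partitions, i.e. whether LHS (or
+  // RHS) is partitioned only along parallel dimensions; if neither is, this
+  // strategy does not apply and nullptr is returned.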
+ int64 lhs_parallel_dim_partitions = 1; + int64 rhs_parallel_dim_partitions = 1; + std::vector parallel_spatial_dims; + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + int64 lhs_dim = dnums.input_spatial_dimensions(i); + int64 lhs_size = lhs.base_shape().dimensions(lhs_dim); + const auto& wd = conv_window.dimensions(i); + int64 rhs_dim = dnums.kernel_spatial_dimensions(i); + if (dot_as_convolution_util::ConvSpatialDimensionIsParallel(wd, lhs_size)) { + parallel_spatial_dims.emplace_back(i); + lhs_parallel_dim_partitions *= ShardCountAtDim(lhs.sharding(), lhs_dim); + rhs_parallel_dim_partitions *= ShardCountAtDim(rhs.sharding(), rhs_dim); + } + } + bool lhs_partition_dims_are_parallel = + (lhs_parallel_dim_partitions == num_partitions); + bool rhs_partition_dims_are_parallel = + (rhs_parallel_dim_partitions == num_partitions); + + // If there is a parallel dim and all the partitioned dimensions are parallel + // dimensions in either LHS or RHS, simply create partitioned convolutions. + if (parallel_spatial_dims.empty() || ((!lhs_partition_dims_are_parallel) && + (!rhs_partition_dims_are_parallel))) { + return nullptr; + } + // Reshard LHS or RHS to partition at parallel dimensions as the other + // operand. + if (lhs_partition_dims_are_parallel) { + rhs = rhs.Reshard(aligned_rhs_sharding); + } else { + lhs = lhs.Reshard(aligned_lhs_sharding); + } + + // Get LHS and RHS sharded shape. + auto lhs_shard_shape = MakePartitionedShape(lhs.base_shape(), lhs.sharding()); + auto rhs_shard_shape = MakePartitionedShape(rhs.base_shape(), rhs.sharding()); + + // Update convolution window. + auto new_window = conv_window; + for (const auto& spatial_dim : parallel_spatial_dims) { + auto wd = new_window.mutable_dimensions(spatial_dim); + wd->set_size(lhs_shard_shape.dimensions( + dnums.input_spatial_dimensions(spatial_dim))); + wd->set_stride(std::max(1, wd->size() - 1)); + wd->set_base_dilation(wd->size()); + } + TF_ASSIGN_OR_RETURN( + Shape sharded_conv_shape, + ShapeInference::InferConvolveShape( + lhs_shard_shape, rhs_shard_shape, original_hlo->feature_group_count(), + original_hlo->batch_group_count(), new_window, dnums)); + auto sharded_conv = b->AddInstruction(HloInstruction::CreateConvolve( + sharded_conv_shape, lhs.hlo(), rhs.hlo(), + original_hlo->feature_group_count(), original_hlo->batch_group_count(), + new_window, dnums, original_hlo->precision_config())); + sharded_conv->set_sharding(original_hlo->sharding()); + return PartitionedHlo(sharded_conv, output_base_shape, lhs.state()) + .Reshard(output_sharding) + .hlo(); +} + +// Partition convolution when both LHS and RHS are partitioned at spatial +// dimensions. Halo exchange will happen on RHS only. 
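+// Each partition convolves its (aligned) LHS shard with an RHS shard that has
+// been extended with halos exchanged from neighboring partitions, producing a
+// partial result of the full output shape; the partial results are then summed
+// with a cross-partition all-reduce and resharded to the requested output
+// sharding.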
+StatusOr +PartitionConvolutionWithSpatialDimensionHaloExchangeOnRHS( + PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape, + const HloSharding& output_sharding, const Window& conv_window, + HloInstruction* original_hlo, HloInstruction* partition_id, + HloModule* module, SpmdBuilder* b) { + TF_RET_CHECK(original_hlo->opcode() == HloOpcode::kConvolution); + TF_RET_CHECK(!lhs.sharding().IsTileMaximal() && + !rhs.sharding().IsTileMaximal()); + + const auto& dnums = original_hlo->convolution_dimension_numbers(); + std::vector rhs_to_lhs_indices(output_base_shape.rank()); + rhs_to_lhs_indices[dnums.kernel_output_feature_dimension()] = + dnums.input_batch_dimension(); + rhs_to_lhs_indices[dnums.kernel_input_feature_dimension()] = + dnums.input_feature_dimension(); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + rhs_to_lhs_indices[dnums.kernel_spatial_dimensions(i)] = + dnums.input_spatial_dimensions(i); + } + std::vector lhs_to_rhs_indices(output_base_shape.rank()); + for (int64 i = 0; i < rhs_to_lhs_indices.size(); ++i) { + lhs_to_rhs_indices[rhs_to_lhs_indices[i]] = i; + } + auto aligned_rhs_sharding = + hlo_sharding_util::TransposeSharding(lhs.sharding(), rhs_to_lhs_indices); + auto aligned_lhs_sharding = + hlo_sharding_util::TransposeSharding(rhs.sharding(), lhs_to_rhs_indices); + + auto unsupported_sharding = [&](const HloSharding& lhs_sharding, + const HloSharding& rhs_sharding) { + // We currently don't support partitioning input batch or output feature + // dimensions. + return lhs_sharding.tile_assignment().dim(dnums.input_batch_dimension()) != + 1 || + rhs_sharding.tile_assignment().dim( + dnums.kernel_output_feature_dimension()) != 1; + }; + + auto zero = b->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(output_base_shape.element_type()))); + if (ShapeSizeInBytes(lhs.base_shape()) < ShapeSizeInBytes(rhs.base_shape())) { + if (unsupported_sharding(aligned_lhs_sharding, rhs.sharding())) { + return nullptr; + } + lhs = lhs.Reshard(aligned_lhs_sharding).PadWithValue(zero); + rhs = rhs.PadWithValue(zero); + } else { + if (unsupported_sharding(lhs.sharding(), aligned_rhs_sharding)) { + return nullptr; + } + lhs = lhs.PadWithValue(zero); + rhs = rhs.Reshard(aligned_rhs_sharding).PadWithValue(zero); + } + + // Reshard RHS so that each shard computes the partial sum of the full + // shape result, and add AllReduce. See HandleConvolutionTiledLhsAndRhs() + // that reshards LHS. + // + // The size of halo on each dimension can be calculated from the + // projection onto the RHS that shard i needs to read. RHS and LHS below + // refers to the shard size of RHS and LHS, WC is the number of windows, + // and D is the window dilation. 
+  //
+  // * offset(i): LHS * i + low_padding - (WC - 1) * stride
+  // * limit(i): LHS * (i + 1) + low_padding
+  //
+  // Since shard i has RHS of range [i * RHS * D, (i + 1) * RHS * D)
+  // * left-halo: i * RHS - offset(i)
+  //            = i * (RHS * D - LHS) + (WC - 1) * stride - low_padding
+  // * right-halo: limit(i) - (i + 1) * RHS
+  //            = (i + 1) * (LHS - RHS * D) + low_padding
+  const auto& collective_ops_creator = lhs.state().collective_ops_creator;
+  std::vector<int64> shard_counts(dnums.input_spatial_dimensions_size());
+  std::vector<int64> lhs_shard_sizes(dnums.input_spatial_dimensions_size());
+  std::vector<int64> rhs_shard_sizes(dnums.input_spatial_dimensions_size());
+
+  for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) {
+    int64 lhs_dimension = dnums.input_spatial_dimensions(i);
+    int64 rhs_dimension = dnums.kernel_spatial_dimensions(i);
+    int64 shard_count = rhs.sharding().tile_assignment().dim(rhs_dimension);
+    auto wd = conv_window.dimensions(i);
+    if (wd.base_dilation() != 1 || wd.window_reversal()) {
+      return nullptr;
+    }
+
+    int64 lhs_shard_size =
+        CeilOfRatio(lhs.base_shape().dimensions(lhs_dimension), shard_count);
+    int64 rhs_shard_size =
+        CeilOfRatio(rhs.base_shape().dimensions(rhs_dimension), shard_count);
+    shard_counts[i] = shard_count;
+    lhs_shard_sizes[i] = lhs_shard_size;
+    rhs_shard_sizes[i] = rhs_shard_size;
+  }
+
+  std::vector<OffsetCalculation> left_halo_size_functions(
+      output_base_shape.rank());
+  std::vector<OffsetCalculation> right_halo_size_functions(
+      output_base_shape.rank());
+  Window new_window = conv_window;
+
+  // Data structures needed for Pad and DynamicSlice on LHS if needed.
+  bool need_dynamic_slice_lhs = false;
+  auto partition_ordinals =
+      MakeTiledPartitionOrdinals(lhs.sharding(), partition_id, b);
+  std::vector<int64> zero_padding(output_base_shape.rank());
+  PaddingConfig pad_config = window_util::MakeSymmetricPadding(zero_padding);
+  auto zero_s32 =
+      b->AddInstruction(HloInstruction::CreateConstant(LiteralUtil::Zero(S32)));
+  std::vector<HloInstruction*> dynamic_slice_start_indices(
+      output_base_shape.rank(), zero_s32);
+  Shape dynamic_slice_shape = lhs.hlo()->shape();
+  Shape pad_shape = lhs.hlo()->shape();
+
+  for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) {
+    int64 lhs_dimension = dnums.input_spatial_dimensions(i);
+    int64 rhs_dimension = dnums.kernel_spatial_dimensions(i);
+    int64 lhs_shard_size = lhs_shard_sizes[i];
+    int64 rhs_shard_size = rhs_shard_sizes[i];
+
+    if (shard_counts[i] == 1) {
+      continue;
+    }
+
+    // Calculate the left and right halo sizes as described in the comments
+    // above. It calculates the halo sizes with dilation, so we apply
+    // CeilOfRatio({left,right}_halo_size, window_dilation).
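+    // The OffsetCalculation objects below encode these halo sizes as
+    // (multiplier * shard_ordinal + offset) / divisor; using window_dilation
+    // as the divisor and folding (window_dilation - 1) into the additive term
+    // is what realizes that CeilOfRatio.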
+ auto wd = conv_window.dimensions(i); + int64 padding_low = wd.padding_low(); + int64 padding_high = wd.padding_high(); + int64 base = lhs.base_shape().dimensions(lhs_dimension); + int64 window_count = 1 + (padding_low + padding_high + base - + (1 + (wd.size() - 1) * wd.window_dilation())) / + wd.stride(); + left_halo_size_functions[rhs_dimension] = + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + rhs_shard_size * wd.window_dilation() - lhs_shard_size, + (window_count - 1) * wd.stride() - padding_low + + wd.window_dilation() - 1, + wd.window_dilation())); + right_halo_size_functions[rhs_dimension] = + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + lhs_shard_size - rhs_shard_size * wd.window_dilation(), + lhs_shard_size - rhs_shard_size * wd.window_dilation() + + padding_low + wd.window_dilation() - 1, + wd.window_dilation())); + + // New RHS window size includes the maximum of both left and right + // halos. + int64 halo_size = + left_halo_size_functions[rhs_dimension].MaxInRange(1, shard_counts[i]) + + right_halo_size_functions[rhs_dimension].MaxInRange( + 0, shard_counts[i] - 1); + int64 new_window_size = + rhs.hlo()->shape().dimensions(rhs_dimension) + halo_size; + + // The amount of new low padding could be dynamic (e.g., window_dilation + // != 1), which requires pad (to the maximum) and dynamic slice on LHS. + // + // If we consider the first window, the offset of the dilated RHS that + // aligns with the first valid LHS element for shard i is 'padding_low + + // LHS * i'. When the left halo is added to RHS, the offset of the first + // RHS element is (RHS * i - left_halo) * window_dilation. The + // difference between the two values is the amount of padding_low we + // need on LHS. + auto new_padding_low_function = + OffsetCalculation(HloOpcode::kMultiply, + left_halo_size_functions[rhs_dimension], + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + 0, wd.window_dilation(), 1))) - + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + rhs_shard_size * wd.window_dilation() - lhs_shard_size, + -padding_low, 1)); + + int64 new_padding_low_max = + new_padding_low_function.MaxInRange(0, shard_counts[i]); + int64 new_padding_low = new_padding_low_max; + int64 new_padding_high = window_count * wd.stride() + + (new_window_size - 1) * wd.window_dilation() - + new_padding_low - lhs_shard_size; + + // We do pad/dynamic-slice only when the padding is dynamic. + if (!new_padding_low_function.IsConstant()) { + need_dynamic_slice_lhs = true; + new_padding_low = 0; + pad_config.mutable_dimensions(lhs_dimension) + ->set_edge_padding_low(new_padding_low_max); + pad_config.mutable_dimensions(lhs_dimension) + ->set_edge_padding_high(new_padding_low_max); + pad_shape.set_dimensions(lhs_dimension, + lhs_shard_size + 2 * new_padding_low_max); + dynamic_slice_start_indices[lhs_dimension] = + (OffsetCalculation( + MultiplyAddDivideOffsetCalculation(0, new_padding_low_max, 1)) - + new_padding_low_function) + .Calculate(partition_ordinals[lhs_dimension], b); + dynamic_slice_shape.set_dimensions(lhs_dimension, + lhs_shard_size + new_padding_low_max); + } + + // Since the convolution RHS operand size increased with halos, adjust + // the window config accordingly. 
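+    // (Its size becomes the local RHS shard size plus the exchanged halo, and
+    // the padding computed above replaces the original padding.)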
+ new_window.mutable_dimensions(i)->set_padding_low(new_padding_low); + new_window.mutable_dimensions(i)->set_padding_high(new_padding_high); + new_window.mutable_dimensions(i)->set_size( + rhs.hlo()->shape().dimensions(rhs_dimension) + halo_size); + } + + HloInstruction* conv_lhs = lhs.hlo(); + if (need_dynamic_slice_lhs) { + auto pad = b->AddInstruction( + HloInstruction::CreatePad(pad_shape, lhs.hlo(), zero, pad_config)); + conv_lhs = b->AddInstruction(HloInstruction::CreateDynamicSlice( + dynamic_slice_shape, pad, dynamic_slice_start_indices, + dynamic_slice_shape.dimensions())); + } + + // Exchange halo and concatenate. + HloInstruction* rhs_with_halo = rhs.hlo(); + for (int i = 0; i < dnums.kernel_spatial_dimensions_size(); ++i) { + int64 dim = dnums.kernel_spatial_dimensions(i); + int64 explicit_left_padding_on_full_shape = + left_halo_size_functions[dim].Calculate(0); + int64 shard_size_with_halo = new_window.dimensions(i).size(); + + // offset_on_padded_shape and padded_full_shape_size are needed only if + // we want to mask out-of-range values in ExchangeHaloAndGetValidData(). + // Since the default value for both the collective-permute is zero and + // also we call PadWithValue() on both operands at the beginning, we + // don't need to mask here. + // + // TODO(hyoulkee): Consider removing one of the two PadWithValue() calls + // if it's always safe. + auto offset_on_padded_shape = + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + rhs_shard_sizes[i], explicit_left_padding_on_full_shape, 1)) - + left_halo_size_functions[dim]; + int64 padded_full_shape_size = + offset_on_padded_shape.Calculate(shard_counts[i] - 1) + + new_window.dimensions(i).size(); + auto concat = ExchangeHaloAndGetValidData( + rhs_with_halo, rhs.base_shape(), left_halo_size_functions[dim], + right_halo_size_functions[dim], explicit_left_padding_on_full_shape, + padded_full_shape_size, shard_size_with_halo, dim, rhs.sharding(), + offset_on_padded_shape.Calculate(partition_ordinals[dim], b), zero, + partition_ordinals[dim], collective_ops_creator, + lhs.state().next_channel_id, b, + /*mask_invalid_region=*/false); + if (!concat) { + return nullptr; + } + rhs_with_halo = *concat; + } + + auto conv = b->AddInstruction(HloInstruction::CreateConvolve( + output_base_shape, conv_lhs, rhs_with_halo, + original_hlo->feature_group_count(), original_hlo->batch_group_count(), + new_window, dnums, original_hlo->precision_config())); + auto ar = collective_ops_creator.create_cross_partition_all_reduce( + b, conv, MakeBinaryAdd(original_hlo->shape().element_type(), module), {}, + (*lhs.state().next_channel_id)++); + ar->set_sharding(HloSharding::Replicate()); + return PartitionedHlo(ar, output_base_shape, lhs.state()) + .Reshard(output_sharding) + .hlo(); +} + +// Partition convolution when both LHS and RHS are partitioned at spatial +// dimensions. Halo exchange will happen on LHS only. +StatusOr +PartitionConvolutionWithSpatialDimensionHaloExchangeOnLHS( + PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape, + const HloSharding& output_sharding, const Window& conv_window, + HloInstruction* original_hlo, HloInstruction* partition_id, + HloModule* module, SpmdBuilder* b) { + TF_RET_CHECK(original_hlo->opcode() == HloOpcode::kConvolution); + TF_RET_CHECK(!lhs.sharding().IsTileMaximal() && + !rhs.sharding().IsTileMaximal()); + + const auto& dnums = original_hlo->convolution_dimension_numbers(); + + // Check if the operand shardings are aligned. 
Also we currently don't + // support partitioning non-spatial dimensions. + std::vector rhs_to_lhs_indices(output_base_shape.rank()); + rhs_to_lhs_indices[dnums.kernel_output_feature_dimension()] = + dnums.input_batch_dimension(); + rhs_to_lhs_indices[dnums.kernel_input_feature_dimension()] = + dnums.input_feature_dimension(); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + rhs_to_lhs_indices[dnums.kernel_spatial_dimensions(i)] = + dnums.input_spatial_dimensions(i); + } + std::vector lhs_to_rhs_indices(output_base_shape.rank()); + for (int64 i = 0; i < rhs_to_lhs_indices.size(); ++i) { + lhs_to_rhs_indices[rhs_to_lhs_indices[i]] = i; + } + + Window window = conv_window; + std::vector reversed_rhs_dims; + for (int64 i = 0; i < window.dimensions_size(); ++i) { + if (window.dimensions(i).window_reversal()) { + reversed_rhs_dims.push_back(dnums.kernel_spatial_dimensions(i)); + } + } + if (!reversed_rhs_dims.empty()) { + // Make the reversed dims left-padded to prepare for window reversal. + auto left_padded_rhs = HaloExchangeToPadOnLeft(rhs, reversed_rhs_dims); + if (left_padded_rhs == nullptr) { + return nullptr; + } + left_padded_rhs->set_sharding(rhs.sharding()); + rhs = PartitionedHlo(left_padded_rhs, rhs.base_shape(), rhs.state()); + } + // Consider window reversal when resharding RHS or LHS. Note: this will not + // reverse the data in the shard. We use window reversal to do that. + auto aligned_rhs_sharding = hlo_sharding_util::ReverseSharding( + hlo_sharding_util::TransposeSharding(lhs.sharding(), rhs_to_lhs_indices), + reversed_rhs_dims); + auto aligned_lhs_sharding = hlo_sharding_util::TransposeSharding( + hlo_sharding_util::ReverseSharding(rhs.sharding(), reversed_rhs_dims), + lhs_to_rhs_indices); + + auto unsupported_sharding = [&](const HloSharding& lhs_sharding, + const HloSharding& rhs_sharding) { + return lhs_sharding.tile_assignment().dim(dnums.input_batch_dimension()) != + 1 || + rhs_sharding.tile_assignment().dim( + dnums.kernel_output_feature_dimension()) != 1; + }; + + auto zero = b->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(output_base_shape.element_type()))); + if (ShapeSizeInBytes(lhs.base_shape()) < ShapeSizeInBytes(rhs.base_shape())) { + if (unsupported_sharding(aligned_lhs_sharding, rhs.sharding())) { + return nullptr; + } + lhs = lhs.Reshard(aligned_lhs_sharding).PadWithValue(zero); + rhs = rhs.PadWithValue(zero, reversed_rhs_dims); + } else { + if (unsupported_sharding(lhs.sharding(), aligned_rhs_sharding)) { + return nullptr; + } + lhs = lhs.PadWithValue(zero); + rhs = + rhs.Reshard(aligned_rhs_sharding).PadWithValue(zero, reversed_rhs_dims); + } + + // Reshard LHS by exchanging halo such that each shard computes the partial + // sum of the full shape result, and add AllReduce. + // + // The size of halo on each dimension can be calculated from the projection + // onto the LHS that each RHS shard i needs to read. RHS and LHS below refers + // to the shard size of RHS and LHS, WC is the number of windows, and D is the + // window dilation. 
+ // + // * offset(i): RHS * D * i - low_padding + // * limit(i): {RHS * (i + 1) * D - (D - 1)} + (WC - 1) * stride - low_padding + // + // Since shard i has LHS of range [i * LHS, (i + 1) * LHS) + // * left-halo: i * LHS - offset(i) + // = (LHS - RHS * D) * i + low_padding + // * right-halo: limit(i) - (i + 1) * LHS + // = (RHS * D - LHS) * (i + 1) + (1 - D) + (WC - 1) * stride - low_padding + // = (RHS * D - LHS) * i + (RHS * D - LHS) + (1-D) + // + (WC - 1) * stride - low_padding + std::vector shard_counts(dnums.input_spatial_dimensions_size()); + std::vector lhs_shard_sizes(dnums.input_spatial_dimensions_size()); + std::vector rhs_shard_sizes(dnums.input_spatial_dimensions_size()); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + int64 lhs_dimension = dnums.input_spatial_dimensions(i); + int64 rhs_dimension = dnums.kernel_spatial_dimensions(i); + int64 shard_count = lhs.sharding().tile_assignment().dim(lhs_dimension); + auto wd = window.dimensions(i); + if (wd.base_dilation() != 1) { + // TODO(wangtao): support parallel dim if it is replicate here. + return nullptr; + } + + int64 lhs_shard_size = + CeilOfRatio(lhs.base_shape().dimensions(lhs_dimension), shard_count); + int64 rhs_shard_size = + CeilOfRatio(rhs.base_shape().dimensions(rhs_dimension), shard_count); + shard_counts[i] = shard_count; + lhs_shard_sizes[i] = lhs_shard_size; + rhs_shard_sizes[i] = rhs_shard_size; + } + + std::vector left_halo_size_functions( + output_base_shape.rank()); + std::vector right_halo_size_functions( + output_base_shape.rank()); + Window new_window = window; + + auto partition_ordinals = + MakeTiledPartitionOrdinals(lhs.sharding(), partition_id, b); + HloInstruction* lhs_with_halo = lhs.hlo(); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + int64 lhs_dimension = dnums.input_spatial_dimensions(i); + int64 lhs_shard_size = lhs_shard_sizes[i]; + int64 rhs_shard_size = rhs_shard_sizes[i]; + + if (shard_counts[i] == 1) { + continue; + } + + // Calculate the left and right halo sizes as described in the comments + // above. + auto wd = window.dimensions(i); + int64 padding_low = wd.padding_low(); + int64 padding_high = wd.padding_high(); + int64 base = lhs.base_shape().dimensions(lhs_dimension); + int64 window_count = 1 + (padding_low + padding_high + base - + (1 + (wd.size() - 1) * wd.window_dilation())) / + wd.stride(); + int64 rhs_shard_size_dilated = + (rhs_shard_size - 1) * wd.window_dilation() + 1; + + left_halo_size_functions[lhs_dimension] = + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + lhs_shard_size - rhs_shard_size * wd.window_dilation(), padding_low, + 1)); + right_halo_size_functions[lhs_dimension] = + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + rhs_shard_size * wd.window_dilation() - lhs_shard_size, + rhs_shard_size * wd.window_dilation() - lhs_shard_size + 1 - + wd.window_dilation() + wd.stride() * (window_count - 1) - + padding_low, + 1)); + + // Exchange halo and concatenate. + int64 dim = dnums.input_spatial_dimensions(i); + int64 explicit_left_padding_on_full_shape = padding_low; + int64 shard_size_with_halo = + wd.stride() * (window_count - 1) + rhs_shard_size_dilated; + + new_window.mutable_dimensions(i)->set_padding_low(0); + new_window.mutable_dimensions(i)->set_padding_high(0); + new_window.mutable_dimensions(i)->set_size(rhs_shard_size); + + // offset_on_padded_shape and padded_full_shape_size are needed only if + // we want to mask out-of-range values in ExchangeHaloAndGetValidData(). 
+ // Since the default value for both the collective-permute is zero and + // also we call PadWithValue() on both operands at the beginning, we + // don't need to mask here. + // + // TODO(hyoulkee): Consider removing one of the two PadWithValue() calls + // if it's always safe. + auto offset_on_padded_shape = + OffsetCalculation(MultiplyAddDivideOffsetCalculation()); + int64 padded_full_shape_size = 0; + auto concat = ExchangeHaloAndGetValidData( + lhs_with_halo, lhs.base_shape(), left_halo_size_functions[dim], + right_halo_size_functions[dim], explicit_left_padding_on_full_shape, + padded_full_shape_size, shard_size_with_halo, dim, lhs.sharding(), + offset_on_padded_shape.Calculate(partition_ordinals[dim], b), zero, + partition_ordinals[dim], lhs.state().collective_ops_creator, + lhs.state().next_channel_id, b, + /*mask_invalid_region=*/false); + if (!concat) { + return nullptr; + } + lhs_with_halo = *concat; + } + + auto conv = b->AddInstruction(HloInstruction::CreateConvolve( + output_base_shape, lhs_with_halo, rhs.hlo(), + original_hlo->feature_group_count(), original_hlo->batch_group_count(), + new_window, original_hlo->convolution_dimension_numbers(), + original_hlo->precision_config())); + auto ar = + lhs.state().collective_ops_creator.create_cross_partition_all_reduce( + b, conv, MakeBinaryAdd(output_base_shape.element_type(), module), {}, + (*lhs.state().next_channel_id)++); + ar->set_sharding(HloSharding::Replicate()); + return PartitionedHlo(ar, output_base_shape, lhs.state()) + .Reshard(output_sharding) + .hlo(); +} + +// Partition convolution when output is sharded. Will shard LHS with replicated +// RHS. +StatusOr PartitionConvolutionTiledOutput( + PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape, + const HloSharding& output_sharding, const Window& conv_window, + HloInstruction* original_hlo, SpmdBuilder* b) { + TF_RET_CHECK(original_hlo->opcode() == HloOpcode::kConvolution); + const auto& dnums = original_hlo->convolution_dimension_numbers(); + TF_RET_CHECK(!output_sharding.IsTileMaximal()); + // We don't currently support sharding on output feature dimension. + if (output_sharding.tile_assignment().dim(dnums.output_feature_dimension()) > + 1) { + return nullptr; + } + + // Check if the operand and the output sharding are aligned. + std::vector input_to_output_indices(output_base_shape.rank()); + input_to_output_indices[dnums.input_batch_dimension()] = + dnums.output_batch_dimension(); + input_to_output_indices[dnums.input_feature_dimension()] = + dnums.output_feature_dimension(); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + input_to_output_indices[dnums.input_spatial_dimensions(i)] = + dnums.output_spatial_dimensions(i); + } + auto target_operand_sharding = hlo_sharding_util::TransposeSharding( + output_sharding, input_to_output_indices); + lhs = lhs.Reshard(target_operand_sharding); + + // Replicate the RHS. + rhs = rhs.Reshard(HloSharding::Replicate()); + + // Convolution window config does not include batch and feature dimensions, + // whereas ReshardAsWindowedInput() expects the same number of window + // dimensions as the rank of the operand. So add two more trivial + // dimensions. 
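+  // For example, for an NHWC-laid-out 2-D convolution (an illustrative layout,
+  // not a requirement), the 2-entry conv window becomes a rank-4 window: the
+  // spatial entries land at the input spatial dimension positions 1 and 2,
+  // while the batch and feature positions keep trivial size-1 dimensions.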
+ std::vector ones(output_base_shape.rank(), 1); + auto operand_window = window_util::MakeWindow(ones); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + *operand_window.mutable_dimensions(dnums.input_spatial_dimensions(i)) = + conv_window.dimensions(i); + } + + auto zero = b->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(output_base_shape.element_type()))); + auto resharded_operand_and_window = + lhs.ReshardAsWindowedInput(operand_window, target_operand_sharding, zero); + if (!resharded_operand_and_window.has_value()) { + return nullptr; + } + Window new_window; + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + *new_window.add_dimensions() = + resharded_operand_and_window->shard_window.dimensions( + dnums.input_spatial_dimensions(i)); + } + TF_ASSIGN_OR_RETURN( + Shape sharded_conv_shape, + ShapeInference::InferConvolveShape( + resharded_operand_and_window->sharded_input->shape(), + rhs.hlo()->shape(), original_hlo->feature_group_count(), + original_hlo->batch_group_count(), new_window, dnums)); + auto shard_shape = MakePartitionedShape(output_base_shape, output_sharding); + *sharded_conv_shape.mutable_layout() = shard_shape.layout(); + auto sharded_conv = b->AddInstruction(HloInstruction::CreateConvolve( + sharded_conv_shape, resharded_operand_and_window->sharded_input, + rhs.hlo(), original_hlo->feature_group_count(), + original_hlo->batch_group_count(), new_window, dnums, + original_hlo->precision_config())); + if (!resharded_operand_and_window->dynamic_slice_index_on_output + .has_value()) { + CHECK(ShapeUtil::Compatible(shard_shape, sharded_conv->shape())); + return sharded_conv; + } + return b->AddInstruction(HloInstruction::CreateDynamicSlice( + shard_shape, sharded_conv, + *resharded_operand_and_window->dynamic_slice_index_on_output, + shard_shape.dimensions())); +} + +StatusOr PartitionConvolutionGroupOnParallelDim( + PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape, + const HloSharding& output_sharding, const Window& conv_window, + HloInstruction* original_hlo, const ConvolutionDimsMapping& dims_mapping, + int64 num_partitions, const SpmdPartitionerOptions& options, + HloInstruction* partition_id, HloModule* module, SpmdBuilder* b) { + std::vector lhs_dims; + std::vector rhs_dims; + std::vector output_dims; + auto lhs_sharding_dims_adjusted_to_output = + lhs.sharding().IsReplicated() + ? std::vector(lhs.base_shape().rank(), 1) + : lhs.sharding().tile_assignment().dimensions(); + auto rhs_sharding_dims_adjusted_to_output = + rhs.sharding().IsReplicated() + ? 
std::vector(rhs.base_shape().rank(), 1) + : rhs.sharding().tile_assignment().dimensions(); + auto output_sharding_dims_adjusted_to_lhs = + output_sharding.tile_assignment().dimensions(); + bool lhs_rhs_dims_matching = true; + for (const auto& dim : dims_mapping.parallel_spatial_dims) { + lhs_dims.push_back(dim.lhs); + rhs_dims.push_back(dim.rhs); + output_dims.push_back(dim.output); + if (lhs_sharding_dims_adjusted_to_output[dim.lhs] != + rhs_sharding_dims_adjusted_to_output[dim.rhs]) { + lhs_rhs_dims_matching = false; + } + lhs_sharding_dims_adjusted_to_output[dim.lhs] = + output_sharding.tile_assignment().dim(dim.output); + rhs_sharding_dims_adjusted_to_output[dim.rhs] = + output_sharding.tile_assignment().dim(dim.output); + output_sharding_dims_adjusted_to_lhs[dim.output] = + lhs.sharding().tile_assignment().dim(dim.lhs); + } + auto lhs_grouped = GroupShardingOnDims(lhs.sharding(), lhs_dims); + auto rhs_grouped = GroupShardingOnDims(rhs.sharding(), rhs_dims); + auto output_grouped = GroupShardingOnDims(output_sharding, output_dims); + if (lhs_rhs_dims_matching) { + if (ShapeUtil::ByteSizeOf(lhs.base_shape()) > + ShapeUtil::ByteSizeOf(rhs.base_shape())) { + rhs_grouped = AlignGroupsWith(std::move(rhs_grouped), lhs_grouped); + rhs = rhs.Reshard(UngroupSharding(rhs_grouped)); + } else { + lhs_grouped = AlignGroupsWith(std::move(lhs_grouped), rhs_grouped); + lhs = lhs.Reshard(UngroupSharding(lhs_grouped)); + } + auto reshaped_output_tiling = output_sharding.tile_assignment(); + reshaped_output_tiling.Reshape(output_sharding_dims_adjusted_to_lhs); + output_grouped = AlignGroupsWith( + GroupShardingOnDims(HloSharding::Tile(reshaped_output_tiling), + output_dims), + lhs_grouped); + } else { + auto reshaped_lhs_tiling = lhs.sharding().tile_assignment(); + reshaped_lhs_tiling.Reshape(lhs_sharding_dims_adjusted_to_output); + lhs_grouped = AlignGroupsWith( + GroupShardingOnDims(HloSharding::Tile(reshaped_lhs_tiling), lhs_dims), + output_grouped); + lhs = lhs.Reshard(UngroupSharding(lhs_grouped)); + auto reshaped_rhs_tiling = rhs.sharding().tile_assignment(); + reshaped_rhs_tiling.Reshape(rhs_sharding_dims_adjusted_to_output); + rhs_grouped = AlignGroupsWith( + GroupShardingOnDims(HloSharding::Tile(reshaped_rhs_tiling), rhs_dims), + output_grouped); + rhs = rhs.Reshard(UngroupSharding(rhs_grouped)); + } + + // Update LHS and RHS sharding and shape. 
+ lhs.hlo()->set_sharding(lhs_grouped.sharding); + rhs.hlo()->set_sharding(rhs_grouped.sharding); + CHECK(lhs.hlo() != rhs.hlo() || lhs_grouped.sharding == rhs_grouped.sharding); + auto per_group_partitioner_state = CreatePerGroupPartitioningState( + lhs.state(), lhs_grouped.device_groups, b); + auto grouped_lhs_base_shape = + GetPerGroupBaseShape(lhs_grouped, lhs.base_shape()); + auto grouped_lhs_shard_shape = + MakePartitionedShape(grouped_lhs_base_shape, lhs.sharding()); + // Update convolution window with the new shape + auto new_window = conv_window; + for (const auto& dim : dims_mapping.parallel_spatial_dims) { + auto wd = new_window.mutable_dimensions(dim.spatial); + wd->set_size(grouped_lhs_shard_shape.dimensions(dim.lhs)); + wd->set_stride(std::max(1, wd->size() - 1)); + wd->set_base_dilation(wd->size()); + } + + auto new_partition_id = + lhs.state().collective_ops_creator.create_partition_id(b); + TF_ASSIGN_OR_RETURN( + auto conv, + PartitionConvolution( + PartitionedHlo(lhs.hlo(), grouped_lhs_base_shape, + per_group_partitioner_state), + PartitionedHlo(rhs.hlo(), + GetPerGroupBaseShape(rhs_grouped, rhs.base_shape()), + per_group_partitioner_state), + GetPerGroupBaseShape(output_grouped, output_base_shape), + output_grouped.sharding, new_window, original_hlo, + num_partitions / output_grouped.device_groups.size(), options, + new_partition_id, module, b)); + // Reset the LHS sharding to the ungrouped one. + lhs.hlo()->set_sharding(UngroupSharding(lhs_grouped)); + rhs.hlo()->set_sharding(UngroupSharding(rhs_grouped)); + conv->set_sharding(UngroupSharding(output_grouped)); + return PartitionedHlo(conv, output_base_shape, lhs.state()) + .Reshard(output_sharding) + .hlo(); +} + +// Partition convolution with only one kind of dims partitioned. +StatusOr PartitionConvolutionBaseCase( + PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape, + const HloSharding& output_sharding, const Window& conv_window, + HloInstruction* original_hlo, int64 num_partitions, + const SpmdPartitionerOptions& options, HloInstruction* partition_id, + HloModule* module, SpmdBuilder* b) { + TF_RET_CHECK(original_hlo->opcode() == HloOpcode::kConvolution); + + // Case 1: Either RHS or LHS is only partitioned at parallel dimensions. + TF_ASSIGN_OR_RETURN(auto parallel_partitioned_conv, + PartitionConvolutionWithParallelDimension( + lhs, rhs, output_base_shape, output_sharding, + conv_window, original_hlo, num_partitions, b)); + if (parallel_partitioned_conv) { + return parallel_partitioned_conv; + } + + // Case 2: both RHS and LHS are tiled. + // Handling cases where both operands' shardings are aligned. We check that + // the LHS batch dimension is not partitioned because it is mapped to the + // output feature dimension in aligned_rhs_sharding, which are not the same + // dimension. + if (!lhs.sharding().IsTileMaximal() && !rhs.sharding().IsTileMaximal()) { + if (options.conv_halo_exchange_always_on_lhs) { + TF_ASSIGN_OR_RETURN( + auto partitioned_conv, + PartitionConvolutionWithSpatialDimensionHaloExchangeOnLHS( + lhs, rhs, output_base_shape, output_sharding, conv_window, + original_hlo, partition_id, module, b)); + if (partitioned_conv) { + return partitioned_conv; + } + } else { + TF_ASSIGN_OR_RETURN( + auto partitioned_conv, + PartitionConvolutionWithSpatialDimensionHaloExchangeOnRHS( + lhs, rhs, output_base_shape, output_sharding, conv_window, + original_hlo, partition_id, module, b)); + + if (partitioned_conv) { + return partitioned_conv; + } + } + } + + // Case 3: output is tiled. 
+ if (!output_sharding.IsTileMaximal()) { + TF_ASSIGN_OR_RETURN(auto partitioned_conv, + PartitionConvolutionTiledOutput( + lhs, rhs, output_base_shape, output_sharding, + conv_window, original_hlo, b)); + + if (partitioned_conv) { + return partitioned_conv; + } + } + return nullptr; +} + +// Partition convolution. +StatusOr PartitionConvolution( + PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape, + const HloSharding& output_sharding, const Window& conv_window, + HloInstruction* original_hlo, int64 num_partitions, + const SpmdPartitionerOptions& options, HloInstruction* partition_id, + HloModule* module, SpmdBuilder* b) { + TF_RET_CHECK(original_hlo->opcode() == HloOpcode::kConvolution); + + TF_ASSIGN_OR_RETURN( + auto try_partitioned_conv, + PartitionConvolutionBaseCase(lhs, rhs, output_base_shape, output_sharding, + conv_window, original_hlo, num_partitions, + options, partition_id, module, b)); + if (try_partitioned_conv) { + return try_partitioned_conv; + } + + const auto& dnums = original_hlo->convolution_dimension_numbers(); + spmd::ConvolutionDimsMapping mapping; + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + int64 lhs_dim = dnums.input_spatial_dimensions(i); + int64 lhs_size = lhs.base_shape().dimensions(lhs_dim); + const auto& wd = original_hlo->window().dimensions(i); + int64 rhs_dim = dnums.kernel_spatial_dimensions(i); + int64 output_dim = dnums.output_spatial_dimensions(i); + if (dot_as_convolution_util::ConvSpatialDimensionIsParallel(wd, lhs_size)) { + mapping.parallel_spatial_dims.emplace_back(); + mapping.parallel_spatial_dims.back().lhs = lhs_dim; + mapping.parallel_spatial_dims.back().rhs = rhs_dim; + mapping.parallel_spatial_dims.back().output = output_dim; + mapping.parallel_spatial_dims.back().spatial = i; + } else { + mapping.non_parallel_spatial_dims.emplace_back(); + mapping.non_parallel_spatial_dims.back().lhs = lhs_dim; + mapping.non_parallel_spatial_dims.back().rhs = rhs_dim; + mapping.non_parallel_spatial_dims.back().output = output_dim; + mapping.non_parallel_spatial_dims.back().spatial = i; + } + } + + // lhs_rhs_or_output: 0 lhs, 1 rhs, 2 output. + auto get_partitions_for_dims = + [&](const HloSharding& sharding, + absl::Span dims, + int lhs_rhs_or_output) { + int64 partitions = 1; + if (sharding.IsTileMaximal()) { + return partitions; + } + for (const auto& dim : dims) { + if (lhs_rhs_or_output == 0) { + partitions *= sharding.tile_assignment().dim(dim.lhs); + } else if (lhs_rhs_or_output == 1) { + partitions *= sharding.tile_assignment().dim(dim.rhs); + } else { + CHECK_EQ(lhs_rhs_or_output, 2); + partitions *= sharding.tile_assignment().dim(dim.output); + } + } + return partitions; + }; + + const int64 lhs_parallel_spatial_partitions = + get_partitions_for_dims(lhs.sharding(), mapping.parallel_spatial_dims, 0); + const int64 rhs_parallel_spatial_partitions = + get_partitions_for_dims(rhs.sharding(), mapping.parallel_spatial_dims, 1); + const int64 output_parallel_spatial_partitions = get_partitions_for_dims( + original_hlo->sharding(), mapping.parallel_spatial_dims, 2); + + // Recursively partition on different types of dimensions. + // + // Case 1: Group partitions by parallel spatial dims. 
+ if (lhs_parallel_spatial_partitions == rhs_parallel_spatial_partitions && + lhs_parallel_spatial_partitions == output_parallel_spatial_partitions && + lhs_parallel_spatial_partitions > 1) { + TF_ASSIGN_OR_RETURN(auto try_partitioned_conv, + PartitionConvolutionGroupOnParallelDim( + lhs, rhs, output_base_shape, output_sharding, + conv_window, original_hlo, mapping, num_partitions, + options, partition_id, module, b)); + if (try_partitioned_conv) { + return try_partitioned_conv; + } + } + + return nullptr; +} + +} // namespace + +Status SpmdPartitioningVisitor::HandleConvolution(HloInstruction* hlo) { + auto dot_dnums = dot_as_convolution_util::ParseDotGeneralFromConvolution(hlo); + if (dot_dnums) { + // Use HandleDotHelper() for convs that are actually einsums. + spmd::DotGeneralDimsMapping mapping; + for (const auto& dims : dot_dnums->batch_dims) { + mapping.batch_dims.emplace_back(); + mapping.batch_dims.back().lhs = dims.lhs; + mapping.batch_dims.back().rhs = dims.rhs; + mapping.batch_dims.back().output = dims.output; + } + for (const auto& dims : dot_dnums->contracting_dims) { + mapping.contracting_dims.emplace_back(); + mapping.contracting_dims.back().lhs = dims.lhs; + mapping.contracting_dims.back().rhs = dims.rhs; + mapping.contracting_dims.back().output = dims.output; + } + for (const auto& dims : dot_dnums->lhs_non_contracting_dims) { + mapping.lhs_non_contracting_dims.emplace_back(); + mapping.lhs_non_contracting_dims.back().lhs = dims.lhs; + mapping.lhs_non_contracting_dims.back().rhs = dims.rhs; + mapping.lhs_non_contracting_dims.back().output = dims.output; + } + for (const auto& dims : dot_dnums->rhs_non_contracting_dims) { + mapping.rhs_non_contracting_dims.emplace_back(); + mapping.rhs_non_contracting_dims.back().lhs = dims.lhs; + mapping.rhs_non_contracting_dims.back().rhs = dims.rhs; + mapping.rhs_non_contracting_dims.back().output = dims.output; + } + auto create_sharded_conv = + [&](HloInstruction* lhs_hlo, HloInstruction* rhs_hlo, + spmd::SpmdBuilder* b) -> StatusOr { + TF_ASSIGN_OR_RETURN( + auto sharded_conv, + dot_as_convolution_util::CreateShardedConvForDotGeneralConvolution( + *hlo, *dot_dnums, lhs_hlo, rhs_hlo)); + return b->AddInstruction(std::move(sharded_conv)); + }; + return HandleDotHelper(hlo, mapping, create_sharded_conv); + } + + auto lhs = GetPartitionedHlo(hlo->operand(0)); + auto rhs = GetPartitionedHlo(hlo->operand(1)); + TF_ASSIGN_OR_RETURN( + auto partitioned_conv, + PartitionConvolution(lhs, rhs, hlo->shape(), hlo->sharding(), + hlo->window(), hlo, num_partitions_, options_, + partition_id_, module_, &b_)); + + if (partitioned_conv) { + SetPartitionedHlo(hlo, [&] { return partitioned_conv; }); + return Status::OK(); + } + return DefaultAction(hlo); +} + +} // namespace spmd +} // namespace xla diff --git a/tensorflow/compiler/xla/service/spmd/dot_handler.cc b/tensorflow/compiler/xla/service/spmd/dot_handler.cc new file mode 100644 index 00000000000..55ebe120d01 --- /dev/null +++ b/tensorflow/compiler/xla/service/spmd/dot_handler.cc @@ -0,0 +1,1587 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "absl/algorithm/container.h" +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_sharding.h" +#include "tensorflow/compiler/xla/service/hlo_sharding_util.h" +#include "tensorflow/compiler/xla/service/shape_inference.h" +#include "tensorflow/compiler/xla/service/spmd/spmd_partitioner.h" +#include "tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/platform/numbers.h" + +namespace xla { +namespace spmd { + +Status SpmdPartitioningVisitor::HandleDot(HloInstruction* hlo) { + DotGeneralDimsMapping mapping; + const auto& dnums = hlo->dot_dimension_numbers(); + int64 next_output_dim = 0; + for (int64 i = 0; i < dnums.lhs_batch_dimensions_size(); ++i) { + mapping.batch_dims.emplace_back(); + mapping.batch_dims.back().lhs = dnums.lhs_batch_dimensions(i); + mapping.batch_dims.back().rhs = dnums.rhs_batch_dimensions(i); + mapping.batch_dims.back().output = next_output_dim++; + } + for (int64 i = 0; i < dnums.lhs_contracting_dimensions_size(); ++i) { + mapping.contracting_dims.emplace_back(); + mapping.contracting_dims.back().lhs = dnums.lhs_contracting_dimensions(i); + mapping.contracting_dims.back().rhs = dnums.rhs_contracting_dimensions(i); + mapping.contracting_dims.back().output = -1; + } + for (int64 i = 0; i < hlo->operand(0)->shape().rank(); ++i) { + if (absl::c_linear_search(dnums.lhs_batch_dimensions(), i) || + absl::c_linear_search(dnums.lhs_contracting_dimensions(), i)) { + continue; + } + mapping.lhs_non_contracting_dims.emplace_back(); + mapping.lhs_non_contracting_dims.back().lhs = i; + mapping.lhs_non_contracting_dims.back().rhs = -1; + mapping.lhs_non_contracting_dims.back().output = next_output_dim++; + } + for (int64 i = 0; i < hlo->operand(1)->shape().rank(); ++i) { + if (absl::c_linear_search(dnums.rhs_batch_dimensions(), i) || + absl::c_linear_search(dnums.rhs_contracting_dimensions(), i)) { + continue; + } + mapping.rhs_non_contracting_dims.emplace_back(); + mapping.rhs_non_contracting_dims.back().lhs = -1; + mapping.rhs_non_contracting_dims.back().rhs = i; + mapping.rhs_non_contracting_dims.back().output = next_output_dim++; + } + auto create_sharded_dot = [&](HloInstruction* l, HloInstruction* r, + SpmdBuilder* b) -> StatusOr { + TF_ASSIGN_OR_RETURN( + auto sharded_dot_shape, + ShapeInference::InferDotOpShape(l->shape(), r->shape(), + hlo->dot_dimension_numbers())); + return b->AddInstruction(HloInstruction::CreateDot( + sharded_dot_shape, l, r, hlo->dot_dimension_numbers(), + hlo->precision_config())); + }; + return HandleDotHelper(hlo, mapping, create_sharded_dot); +} + +namespace { + +StatusOr PartitionBaseCase( + PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape, + const HloSharding& output_sharding, + const DotGeneralDimsMapping& dims_mapping, int64 num_partitions, + const std::function( + HloInstruction*, HloInstruction*, 
SpmdBuilder*)>& create_sharded_dot, + HloModule* module, HloInstruction* original_hlo, int64 lhs_batch_partitions, + int64 rhs_batch_partitions, int64 output_batch_partitions, + int64 lhs_contracting_partitions, int64 rhs_contracting_partitions, + int64 lhs_non_contracting_partitions, int64 rhs_non_contracting_partitions, + int64 output_lhs_non_contracting_partitions, + int64 output_rhs_non_contracting_partitions, + int64 threshold_for_windowed_einsum_mib, SpmdBuilder* b, + std::vector* + windowed_dot_general_loops) { + const HloSharding& lhs_sharding = lhs.sharding(); + const HloSharding& rhs_sharding = rhs.sharding(); + std::vector lhs_to_rhs_indices(lhs.base_shape().rank(), -1); + std::vector lhs_to_output_indices(lhs.base_shape().rank(), -1); + std::vector rhs_to_lhs_indices(rhs.base_shape().rank(), -1); + std::vector rhs_to_output_indices(rhs.base_shape().rank(), -1); + std::vector output_to_lhs_indices(output_base_shape.rank(), -1); + std::vector output_to_rhs_indices(output_base_shape.rank(), -1); + auto populate_indices_mapping = + [&](const DotGeneralDimsMapping::DimsMapping& mapping) { + if (mapping.lhs >= 0) { + lhs_to_rhs_indices[mapping.lhs] = mapping.rhs; + lhs_to_output_indices[mapping.lhs] = mapping.output; + } + if (mapping.rhs >= 0) { + rhs_to_lhs_indices[mapping.rhs] = mapping.lhs; + rhs_to_output_indices[mapping.rhs] = mapping.output; + } + if (mapping.output >= 0) { + output_to_lhs_indices[mapping.output] = mapping.lhs; + output_to_rhs_indices[mapping.output] = mapping.rhs; + } + }; + for (const auto& mapping : dims_mapping.batch_dims) { + populate_indices_mapping(mapping); + } + for (const auto& mapping : dims_mapping.contracting_dims) { + populate_indices_mapping(mapping); + } + for (const auto& mapping : dims_mapping.lhs_non_contracting_dims) { + populate_indices_mapping(mapping); + } + for (const auto& mapping : dims_mapping.rhs_non_contracting_dims) { + populate_indices_mapping(mapping); + } + auto lhs_sharding_transposed_to_match_rhs = + TransposeShardingWithCollapsedDims(lhs_sharding, lhs_to_rhs_indices, + rhs_to_lhs_indices); + auto rhs_sharding_transposed_to_match_lhs = + TransposeShardingWithCollapsedDims(rhs_sharding, rhs_to_lhs_indices, + lhs_to_rhs_indices); + auto lhs_sharding_transposed_to_match_output = + TransposeShardingWithCollapsedDims(lhs_sharding, lhs_to_output_indices, + output_to_lhs_indices); + auto rhs_sharding_transposed_to_match_output = + TransposeShardingWithCollapsedDims(rhs_sharding, rhs_to_output_indices, + output_to_rhs_indices); + auto output_sharding_transposed_to_match_lhs = + TransposeShardingWithCollapsedDims(output_sharding, output_to_lhs_indices, + lhs_to_output_indices); + auto output_sharding_transposed_to_match_rhs = + TransposeShardingWithCollapsedDims(output_sharding, output_to_rhs_indices, + rhs_to_output_indices); + + // LHS and RHS are partitioned the same way and only partitioned in batch + // dimensions. + if (lhs_batch_partitions == rhs_batch_partitions && + rhs_batch_partitions == num_partitions && + lhs_sharding_transposed_to_match_rhs == rhs_sharding) { + TF_ASSIGN_OR_RETURN(auto dot, create_sharded_dot(lhs.hlo(), rhs.hlo(), b)); + dot->set_sharding(*lhs_sharding_transposed_to_match_output); + return PartitionedHlo(dot, output_base_shape, lhs.state()) + .Reshard(output_sharding) + .hlo(); + } + + // Try emit batch-partitioned einsum with one operand resharded. Returns + // partitioned HLO or nullptr if the attempt fails. 
If + // may_reshard_with_allreduce is false, reshard must be done using + // all-to-all/collective-permute; otherwise this attempt fails. + auto try_emit_output_batch_partitioned_einsum_with_reshard = + [&](bool may_reshard_with_allreduce) -> StatusOr { + // LHS and output are batch partitioned in the same way. + if (lhs_batch_partitions == num_partitions && + output_batch_partitions == num_partitions && + lhs_sharding_transposed_to_match_output == output_sharding) { + if (!may_reshard_with_allreduce && + !CanReshardWithCollectivePermute( + rhs.sharding(), *lhs_sharding_transposed_to_match_rhs) && + !GetReshardAllToAllSourceTargetDims( + rhs.sharding(), *lhs_sharding_transposed_to_match_rhs)) { + return nullptr; + } + auto resharded_rhs = rhs.Reshard(*lhs_sharding_transposed_to_match_rhs); + TF_ASSIGN_OR_RETURN( + auto dot, create_sharded_dot(lhs.hlo(), resharded_rhs.hlo(), b)); + return dot; + } + // RHS and output are batch partitioned in the same way. + if (rhs_batch_partitions == num_partitions && + output_batch_partitions == num_partitions && + rhs_sharding_transposed_to_match_output == output_sharding) { + if (!may_reshard_with_allreduce && + !CanReshardWithCollectivePermute( + lhs.sharding(), *rhs_sharding_transposed_to_match_lhs) && + !GetReshardAllToAllSourceTargetDims( + lhs.sharding(), *rhs_sharding_transposed_to_match_lhs)) { + return nullptr; + } + auto resharded_lhs = lhs.Reshard(*rhs_sharding_transposed_to_match_lhs); + TF_ASSIGN_OR_RETURN( + auto dot, create_sharded_dot(resharded_lhs.hlo(), rhs.hlo(), b)); + return dot; + } + return nullptr; + }; + + { + // Try batch-parallel by resharding one operand, and not using all-reduce. + TF_ASSIGN_OR_RETURN( + HloInstruction * partitioned_dot, + try_emit_output_batch_partitioned_einsum_with_reshard(false)); + if (partitioned_dot) { + return partitioned_dot; + } + } + + // Try to emit windowed DotGeneral when one operand is partitioned in the same + // way as the output along non-contracting dimensions, but the other operand + // is tiled in other dimensions. + auto emit_windowed_dot_general = + [&](int64 matching_operand, int64 windowing_operand, + bool windowed_at_contracting_dims, + bool windowed_at_batch_dims) -> StatusOr { + CHECK_EQ(matching_operand + windowing_operand, 1); + CHECK(!windowed_at_batch_dims || !windowed_at_contracting_dims); + auto unpadded_result_buffer_shape = + MakePartitionedShape(output_base_shape, output_sharding); + auto padded_result_buffer_shape = unpadded_result_buffer_shape; + // For windowing at batch/non-contracting dims, we produce the result one + // partition at a time, so we need to pad the shape in case of uneven + // partitioning in order to make dynamic-update-slice in-bound. + if (!windowed_at_contracting_dims) { + padded_result_buffer_shape = GetPaddedShapeForUnevenPartitioning( + padded_result_buffer_shape, + windowing_operand == 0 ? *lhs_sharding_transposed_to_match_output + : *rhs_sharding_transposed_to_match_output); + } + // Mask the padding area of the windowed operand with zero if there is + // uneven partitioning. + if (windowed_at_contracting_dims) { + auto& to_mask = windowing_operand == 0 ? lhs : rhs; + to_mask = + to_mask.PadWithValue(b->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(output_base_shape.element_type())))); + } + auto result_buffer = CreateZero(padded_result_buffer_shape, b); + auto iteration = b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))); + + // Create a while loop that computes one window per iteration. 
During each + // iteration, each partition sends its input window to its neighbor using + // collective-permute for the next iteration. + SpmdBuilder body_b("windowed_dot_general_body", original_hlo); + auto param = body_b.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, + ShapeUtil::MakeTupleShape({lhs.hlo()->shape(), rhs.hlo()->shape(), + result_buffer->shape(), iteration->shape()}), + "param")); + auto l = body_b.AddInstruction( + HloInstruction::CreateGetTupleElement(lhs.hlo()->shape(), param, 0)); + auto r = body_b.AddInstruction( + HloInstruction::CreateGetTupleElement(rhs.hlo()->shape(), param, 1)); + auto o = body_b.AddInstruction(HloInstruction::CreateGetTupleElement( + result_buffer->shape(), param, 2)); + auto i = body_b.AddInstruction( + HloInstruction::CreateGetTupleElement(iteration->shape(), param, 3)); + + auto partition_id = + lhs.state().collective_ops_creator.create_partition_id(&body_b); + auto data_partition_id = body_b.AddInstruction(HloInstruction::CreateBinary( + i->shape(), HloOpcode::kAdd, i, partition_id)); + auto partition_count = body_b.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(num_partitions))); + data_partition_id = body_b.AddInstruction(HloInstruction::CreateBinary( + i->shape(), HloOpcode::kRemainder, data_partition_id, partition_count)); + auto dot_lhs = l; + auto dot_rhs = r; + if (windowed_at_contracting_dims || windowed_at_batch_dims) { + // Slice the matching operand according to the partitioned contracting + // dimensions on the windowed operand. We do this by treating the matching + // operand as replicated, and resharding it to match the windowed operand. + auto slice_operand = matching_operand == 0 ? l : r; + slice_operand->set_sharding(HloSharding::Replicate()); + auto state = lhs.state(); + state.b = &body_b; + state.partition_id = data_partition_id; + auto slice = PartitionedHlo(slice_operand, slice_operand->shape(), state) + .Reshard(windowing_operand == 0 + ? *lhs_sharding_transposed_to_match_rhs + : *rhs_sharding_transposed_to_match_lhs) + .hlo(); + slice_operand->clear_sharding(); + if (matching_operand == 0) { + dot_lhs = slice; + } else { + dot_rhs = slice; + } + } + TF_ASSIGN_OR_RETURN(auto dot, + create_sharded_dot(dot_lhs, dot_rhs, &body_b)); + if (windowed_at_contracting_dims) { + // Accumulate the partial output to the result buffer. + o = body_b.AddInstruction( + HloInstruction::CreateBinary(o->shape(), HloOpcode::kAdd, o, dot)); + } else { + // The windowing operand is partitioned along batch/non-contracting + // dimensions, so we need a dynamic-update-slice to save the partial + // output in the result buffer. + auto offsets = MakePartitionOffsets( + o->shape(), + windowing_operand == 0 ? *lhs_sharding_transposed_to_match_output + : *rhs_sharding_transposed_to_match_output, + data_partition_id, &body_b); + o = body_b.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + o->shape(), o, dot, offsets)); + } + + // ++i + i = body_b.AddInstruction(HloInstruction::CreateBinary( + i->shape(), HloOpcode::kAdd, i, + body_b.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1))))); + auto has_more = body_b.AddInstruction(HloInstruction::CreateCompare( + ShapeUtil::MakeShape(PRED, {}), i, + body_b.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(num_partitions))), + ComparisonDirection::kLt)); + // Collective-permute for the next window. We don't need it for the last + // iteration, so we use a conditional around the collective-permute. 
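For reference, a standalone sketch (not part of the patch) of the rotation schedule built just below: on every iteration partition i forwards its current window to partition (i - 1) mod n, so after num_partitions iterations each partition has seen every slice of the windowed operand.

    #include <cstdint>
    #include <utility>
    #include <vector>

    // Source->destination pairs for the per-iteration collective-permute.
    std::vector<std::pair<int64_t, int64_t>> RotationPairs(int64_t num_partitions) {
      std::vector<std::pair<int64_t, int64_t>> sd_pairs(num_partitions);
      for (int64_t source = 0; source < num_partitions; ++source) {
        sd_pairs[source] = {source,
                            (source - 1 + num_partitions) % num_partitions};
      }
      return sd_pairs;
    }
    // For num_partitions = 4 this yields {0->3, 1->0, 2->1, 3->2}, matching
    // the "0 -> n-1, 1 -> 0, 2 -> 1, ..." comment in the loop body below.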
+ HloInstruction* conditional; + { + SpmdBuilder cp_b("window_collective_permute", original_hlo); + { + auto p = cp_b.AddInstruction(HloInstruction::CreateParameter( + 0, windowing_operand == 0 ? l->shape() : r->shape(), "window")); + std::vector> sd_pairs(num_partitions); + for (int64 source = 0; source < num_partitions; ++source) { + // 0 -> n-1, 1 -> 0, 2 -> 1, ... + sd_pairs[source] = {source, + (source - 1 + num_partitions) % num_partitions}; + } + lhs.state() + .collective_ops_creator.create_cross_partition_collective_permute( + &cp_b, p, sd_pairs, (*lhs.state().next_channel_id)++); + } + SpmdBuilder ncp_b("last_iteration_noop", original_hlo); + { + ncp_b.AddInstruction(HloInstruction::CreateParameter( + 0, windowing_operand == 0 ? l->shape() : r->shape(), "window")); + } + conditional = body_b.AddInstruction(HloInstruction::CreateConditional( + windowing_operand == 0 ? l->shape() : r->shape(), has_more, + windowing_operand == 0 ? l : r, + module->AddEmbeddedComputation(cp_b.Build()), + windowing_operand == 0 ? l : r, + module->AddEmbeddedComputation(ncp_b.Build()))); + } + if (windowing_operand == 0) { + l = conditional; + } else { + r = conditional; + } + body_b.AddInstruction(HloInstruction::CreateTuple({l, r, o, i})); + + SpmdBuilder cond_b("windowed_dot_general_cond", original_hlo); + auto cond_param = cond_b.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, + ShapeUtil::MakeTupleShape({lhs.hlo()->shape(), rhs.hlo()->shape(), + result_buffer->shape(), iteration->shape()}), + "param")); + auto cond_i = cond_b.AddInstruction(HloInstruction::CreateGetTupleElement( + iteration->shape(), cond_param, 3)); + cond_b.AddInstruction(HloInstruction::CreateCompare( + ShapeUtil::MakeShape(PRED, {}), cond_i, + cond_b.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(num_partitions))), + ComparisonDirection::kLt)); + auto while_loop = b->AddInstruction(HloInstruction::CreateWhile( + cond_param->shape(), module->AddEmbeddedComputation(cond_b.Build()), + module->AddEmbeddedComputation(body_b.Build()), + b->AddInstruction(HloInstruction::CreateTuple( + {lhs.hlo(), rhs.hlo(), result_buffer, iteration})))); + windowed_dot_general_loops->push_back({while_loop, windowing_operand, + windowed_at_contracting_dims, + windowed_at_batch_dims}); + auto result = b->AddInstruction(HloInstruction::CreateGetTupleElement( + result_buffer->shape(), while_loop, 2)); + if (!ShapeUtil::Compatible(padded_result_buffer_shape, + unpadded_result_buffer_shape)) { + result = b->AddInstruction(HloInstruction::CreateSlice( + unpadded_result_buffer_shape, result, + std::vector(padded_result_buffer_shape.rank(), 0), + unpadded_result_buffer_shape.dimensions(), + std::vector(padded_result_buffer_shape.rank(), 1))); + } + return result; + }; + if (output_lhs_non_contracting_partitions == num_partitions && + output_sharding_transposed_to_match_lhs == lhs_sharding && + ShapeSizeInBytes(rhs.base_shape()) >= + threshold_for_windowed_einsum_mib * 1024 * 1024) { + if (rhs_contracting_partitions == num_partitions) { + return emit_windowed_dot_general(0, 1, true, false); + } + if (rhs_non_contracting_partitions == num_partitions) { + return emit_windowed_dot_general(0, 1, false, false); + } + if (rhs_batch_partitions == num_partitions) { + return emit_windowed_dot_general(0, 1, false, true); + } + } + if (output_rhs_non_contracting_partitions == num_partitions && + output_sharding_transposed_to_match_rhs == rhs_sharding && + ShapeSizeInBytes(lhs.base_shape()) >= + 
threshold_for_windowed_einsum_mib * 1024 * 1024) { + if (lhs_contracting_partitions == num_partitions) { + return emit_windowed_dot_general(1, 0, true, false); + } + if (lhs_non_contracting_partitions == num_partitions) { + return emit_windowed_dot_general(1, 0, false, false); + } + if (lhs_batch_partitions == num_partitions) { + return emit_windowed_dot_general(1, 0, false, true); + } + } + + { + // Try batch-parallel by resharding one operand, and allowing all-reduce. + TF_ASSIGN_OR_RETURN( + HloInstruction * partitioned_dot, + try_emit_output_batch_partitioned_einsum_with_reshard(true)); + if (partitioned_dot) { + return partitioned_dot; + } + } + + // LHS and RHS have the same partitioned contracting dimensions. + if (lhs_contracting_partitions == rhs_contracting_partitions && + lhs_contracting_partitions == num_partitions) { + auto zero = b->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(output_base_shape.element_type()))); + // Pad both sides with zero, since NaN at one side cannot be masked by zero + // on the other side. + if (ShapeSizeInBytes(lhs.base_shape()) < + ShapeSizeInBytes(rhs.base_shape())) { + lhs = + lhs.Reshard(*rhs_sharding_transposed_to_match_lhs).PadWithValue(zero); + rhs = rhs.PadWithValue(zero); + } else { + lhs = lhs.PadWithValue(zero); + rhs = + rhs.Reshard(*lhs_sharding_transposed_to_match_rhs).PadWithValue(zero); + } + TF_ASSIGN_OR_RETURN(auto dot, create_sharded_dot(lhs.hlo(), rhs.hlo(), b)); + auto ar = + lhs.state().collective_ops_creator.create_cross_partition_all_reduce( + b, dot, MakeBinaryAdd(output_base_shape.element_type(), module), {}, + (*lhs.state().next_channel_id)++); + ar->set_sharding(HloSharding::Replicate()); + return PartitionedHlo(ar, output_base_shape, lhs.state()) + .Reshard(output_sharding) + .hlo(); + } + + // LHS and output have the same partitioned non-contracting dimensions. + if (lhs_non_contracting_partitions == num_partitions && + output_lhs_non_contracting_partitions == num_partitions && + lhs_sharding_transposed_to_match_output == output_sharding) { + auto rhs_replicated = rhs.Reshard(HloSharding::Replicate()).hlo(); + TF_ASSIGN_OR_RETURN(auto dot, + create_sharded_dot(lhs.hlo(), rhs_replicated, b)); + return dot; + } + + // RHS and output have the same partitioned non-contracting dimensions. + if (rhs_non_contracting_partitions == num_partitions && + output_rhs_non_contracting_partitions == num_partitions && + rhs_sharding_transposed_to_match_output == output_sharding) { + auto lhs_replicated = lhs.Reshard(HloSharding::Replicate()).hlo(); + TF_ASSIGN_OR_RETURN(auto dot, + create_sharded_dot(lhs_replicated, rhs.hlo(), b)); + return dot; + } + + // Output is batch partitioned. + if (output_batch_partitions == num_partitions) { + auto resharded_lhs = lhs.Reshard(*output_sharding_transposed_to_match_lhs); + auto resharded_rhs = rhs.Reshard(*output_sharding_transposed_to_match_rhs); + TF_ASSIGN_OR_RETURN(auto dot, create_sharded_dot(resharded_lhs.hlo(), + resharded_rhs.hlo(), b)); + return dot; + } + // Output is partitioned along LHS non-contracting dimensions. + if (output_lhs_non_contracting_partitions == num_partitions) { + auto resharded_lhs = lhs.Reshard(*output_sharding_transposed_to_match_lhs); + auto replicated_rhs = rhs.Reshard(HloSharding::Replicate()); + TF_ASSIGN_OR_RETURN(auto dot, create_sharded_dot(resharded_lhs.hlo(), + replicated_rhs.hlo(), b)); + return dot; + } + // Output is partitioned along RHS non-contracting dimensions. 
+ if (output_rhs_non_contracting_partitions == num_partitions) { + auto replicated_lhs = lhs.Reshard(HloSharding::Replicate()); + auto resharded_rhs = rhs.Reshard(*output_sharding_transposed_to_match_rhs); + TF_ASSIGN_OR_RETURN(auto dot, create_sharded_dot(replicated_lhs.hlo(), + resharded_rhs.hlo(), b)); + return dot; + } + + // Returns true if it is beneficial to reshard the operand at `operand_idx` + // across the contracting dimension. + const auto should_partition_contracting_dim = [&](int64 operand_idx) { + if (!output_sharding.IsReplicated()) { + return false; + } + + if (operand_idx == 0) { + // If LHS and output are replicated, we compare the cost of all-gather + // on RHS vs all-reduce on the output. + return (rhs_contracting_partitions == num_partitions) && + lhs.sharding().IsReplicated() && + ShapeUtil::ElementsIn(rhs.base_shape()) > + ShapeUtil::ElementsIn(output_base_shape); + } else { + return (lhs_contracting_partitions == num_partitions) && + rhs.sharding().IsReplicated() && + ShapeUtil::ElementsIn(lhs.base_shape()) > + ShapeUtil::ElementsIn(output_base_shape); + } + }; + + // When the output is replicated and one of the operands is partitioned along + // contracting dimension, align the other operand to be partitioned along + // the contracting dimensions. + if (output_sharding.IsReplicated() && (should_partition_contracting_dim(0) || + should_partition_contracting_dim(1))) { + auto zero = b->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(output_base_shape.element_type()))); + if (should_partition_contracting_dim(0)) { + lhs = + lhs.Reshard(*rhs_sharding_transposed_to_match_lhs).PadWithValue(zero); + rhs = rhs.PadWithValue(zero); + } else { + lhs = lhs.PadWithValue(zero); + rhs = + rhs.Reshard(*lhs_sharding_transposed_to_match_rhs).PadWithValue(zero); + } + TF_ASSIGN_OR_RETURN(auto dot, create_sharded_dot(lhs.hlo(), rhs.hlo(), b)); + return lhs.state().collective_ops_creator.create_cross_partition_all_reduce( + b, dot, MakeBinaryAdd(output_base_shape.element_type(), module), {}, + (*lhs.state().next_channel_id)++); + } + return nullptr; +} + +StatusOr PartitionDot( + PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape, + const HloSharding& output_sharding, + const DotGeneralDimsMapping& dims_mapping, int64 num_partitions, + const std::function( + HloInstruction*, HloInstruction*, SpmdBuilder*)>& create_sharded_dot, + HloModule* module, HloInstruction* original_hlo, + int64 threshold_for_windowed_einsum_mib, SpmdBuilder* b, + std::vector* + windowed_dot_general_loops); + +StatusOr PartitionDotGroupOnBatch( + PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape, + const HloSharding& output_sharding, + const DotGeneralDimsMapping& dims_mapping, int64 num_partitions, + int64 lhs_contracting_partitions, int64 rhs_contracting_partitions, + int64 lhs_non_contracting_partitions, int64 rhs_non_contracting_partitions, + const std::function( + HloInstruction*, HloInstruction*, SpmdBuilder*)>& create_sharded_dot, + HloModule* module, HloInstruction* original_hlo, + int64 threshold_for_windowed_einsum_mib, SpmdBuilder* b, + std::vector* + windowed_dot_general_loops) { + std::vector lhs_dims; + std::vector rhs_dims; + std::vector output_dims; + auto lhs_sharding_dims_adjusted_to_output = + lhs.sharding().IsReplicated() + ? std::vector(lhs.base_shape().rank(), 1) + : lhs.sharding().tile_assignment().dimensions(); + auto rhs_sharding_dims_adjusted_to_output = + rhs.sharding().IsReplicated() + ? 
std::vector(rhs.base_shape().rank(), 1) + : rhs.sharding().tile_assignment().dimensions(); + auto output_sharding_dims_adjusted_to_lhs = + output_sharding.tile_assignment().dimensions(); + bool lhs_rhs_dims_matching = true; + for (const auto& dim : dims_mapping.batch_dims) { + lhs_dims.push_back(dim.lhs); + rhs_dims.push_back(dim.rhs); + output_dims.push_back(dim.output); + if (lhs_sharding_dims_adjusted_to_output[dim.lhs] != + rhs_sharding_dims_adjusted_to_output[dim.rhs]) { + lhs_rhs_dims_matching = false; + } + lhs_sharding_dims_adjusted_to_output[dim.lhs] = + output_sharding.tile_assignment().dim(dim.output); + rhs_sharding_dims_adjusted_to_output[dim.rhs] = + output_sharding.tile_assignment().dim(dim.output); + output_sharding_dims_adjusted_to_lhs[dim.output] = + lhs.sharding().tile_assignment().dim(dim.lhs); + } + auto output_grouped = GroupShardingOnDims(output_sharding, output_dims); + PartitionedHlo per_group_lhs = lhs; + PartitionedHlo per_group_rhs = rhs; + auto lhs_sharding = lhs.sharding(); + auto rhs_sharding = rhs.sharding(); + if (lhs_rhs_dims_matching) { + auto lhs_grouped = GroupShardingOnDims(lhs.sharding(), lhs_dims); + auto rhs_grouped = GroupShardingOnDims(rhs.sharding(), rhs_dims); + if (ShapeUtil::ByteSizeOf(lhs.base_shape()) > + ShapeUtil::ByteSizeOf(rhs.base_shape())) { + rhs_grouped = AlignGroupsWith(std::move(rhs_grouped), lhs_grouped); + rhs = rhs.Reshard(UngroupSharding(rhs_grouped)); + } else { + lhs_grouped = AlignGroupsWith(std::move(lhs_grouped), rhs_grouped); + lhs = lhs.Reshard(UngroupSharding(lhs_grouped)); + } + auto reshaped_output_tiling = output_sharding.tile_assignment(); + reshaped_output_tiling.Reshape(output_sharding_dims_adjusted_to_lhs); + output_grouped = AlignGroupsWith( + GroupShardingOnDims(HloSharding::Tile(reshaped_output_tiling), + output_dims), + lhs_grouped); + auto per_group_partitioner_state = CreatePerGroupPartitioningState( + lhs.state(), lhs_grouped.device_groups, b); + lhs.hlo()->set_sharding(lhs_grouped.sharding); + rhs.hlo()->set_sharding(rhs_grouped.sharding); + CHECK(lhs.hlo() != rhs.hlo() || + lhs_grouped.sharding == rhs_grouped.sharding); + per_group_lhs = PartitionedHlo( + lhs.hlo(), GetPerGroupBaseShape(lhs_grouped, lhs.base_shape()), + per_group_partitioner_state); + per_group_rhs = PartitionedHlo( + rhs.hlo(), GetPerGroupBaseShape(rhs_grouped, rhs.base_shape()), + per_group_partitioner_state); + } else { + auto per_group_partitioner_state = CreatePerGroupPartitioningState( + lhs.state(), output_grouped.device_groups, b); + auto reshard_to_output_batch = + [&](PartitionedHlo operand, absl::Span batch_dims, + absl::Span contracting_dims, + absl::Span non_contracting_dims, + int64 contracting_dim_partitions, + int64 non_contracting_dim_partitions, + int64 other_contracting_dim_partitions, + std::vector* sharding_dims_adjusted_to_output) + -> absl::optional { + if (operand.sharding().IsReplicated()) { + auto partially_sharded = PerGroupSliceFromReplicated( + operand.hlo(), operand.state().partition_id, + output_grouped.device_groups, batch_dims, + output_grouped.group_dim_sizes, b); + partially_sharded->set_sharding(HloSharding::Replicate()); + return PartitionedHlo(partially_sharded, partially_sharded->shape(), + per_group_partitioner_state); + } + auto reshaped_tiling = operand.sharding().tile_assignment(); + // It's possible that the operand is not initially sharded on batch + // dimensions in the same way as the output, although being tiled. 
In that + // case, the current sharding_dims_adjusted_to_output may contain more + // partitions than available devices. We remove partitioning on other + // dimensions. + if (Product(*sharding_dims_adjusted_to_output) > + reshaped_tiling.num_elements()) { + if (Product(*sharding_dims_adjusted_to_output) % + reshaped_tiling.num_elements() != + 0) { + return absl::nullopt; + } + int64 ratio = Product(*sharding_dims_adjusted_to_output) / + reshaped_tiling.num_elements(); + if (ratio == non_contracting_dim_partitions && + (ratio != contracting_dim_partitions || + contracting_dim_partitions == other_contracting_dim_partitions)) { + for (int64 dim : non_contracting_dims) { + (*sharding_dims_adjusted_to_output)[dim] = 1; + } + } else if (ratio == contracting_dim_partitions) { + for (int64 dim : contracting_dims) { + (*sharding_dims_adjusted_to_output)[dim] = 1; + } + } + } + // If the operand is initially sharded more ways than the output in the + // batch dimensions, sharding_dims_adjusted_to_output currently contains + // fewer partitions than available devices. We do not handle this case. + if (Product(*sharding_dims_adjusted_to_output) < + reshaped_tiling.num_elements()) { + return absl::nullopt; + } + reshaped_tiling.Reshape(*sharding_dims_adjusted_to_output); + auto grouped = AlignGroupsWith( + GroupShardingOnDims(HloSharding::Tile(reshaped_tiling), batch_dims), + output_grouped); + auto resharded = operand.Reshard(UngroupSharding(grouped)); + resharded.hlo()->set_sharding(grouped.sharding); + return PartitionedHlo(resharded.hlo(), + GetPerGroupBaseShape(grouped, operand.base_shape()), + per_group_partitioner_state); + }; + std::vector lhs_contracting_dims; + std::vector rhs_contracting_dims; + lhs_contracting_dims.reserve(dims_mapping.contracting_dims.size()); + rhs_contracting_dims.reserve(dims_mapping.contracting_dims.size()); + for (const auto& dim : dims_mapping.contracting_dims) { + lhs_contracting_dims.push_back(dim.lhs); + rhs_contracting_dims.push_back(dim.rhs); + } + std::vector lhs_non_contracting_dims; + std::vector rhs_non_contracting_dims; + lhs_non_contracting_dims.reserve( + dims_mapping.lhs_non_contracting_dims.size()); + rhs_non_contracting_dims.reserve( + dims_mapping.rhs_non_contracting_dims.size()); + for (const auto& dim : dims_mapping.lhs_non_contracting_dims) { + lhs_non_contracting_dims.push_back(dim.lhs); + } + for (const auto& dim : dims_mapping.rhs_non_contracting_dims) { + rhs_non_contracting_dims.push_back(dim.rhs); + } + if (auto resharded = reshard_to_output_batch( + lhs, lhs_dims, lhs_contracting_dims, lhs_non_contracting_dims, + lhs_contracting_partitions, lhs_non_contracting_partitions, + rhs_contracting_partitions, + &lhs_sharding_dims_adjusted_to_output)) { + per_group_lhs = *resharded; + } else { + return nullptr; + } + if (auto resharded = reshard_to_output_batch( + rhs, rhs_dims, rhs_contracting_dims, rhs_non_contracting_dims, + rhs_contracting_partitions, rhs_non_contracting_partitions, + lhs_contracting_partitions, + &rhs_sharding_dims_adjusted_to_output)) { + per_group_rhs = *resharded; + } else { + return nullptr; + } + CHECK(lhs.hlo() != rhs.hlo() || + per_group_lhs.sharding() == per_group_rhs.sharding()); + } + TF_ASSIGN_OR_RETURN( + auto dot, + PartitionDot(per_group_lhs, per_group_rhs, + GetPerGroupBaseShape(output_grouped, output_base_shape), + output_grouped.sharding, dims_mapping, + num_partitions / output_grouped.device_groups.size(), + create_sharded_dot, module, original_hlo, + threshold_for_windowed_einsum_mib, b, + 
windowed_dot_general_loops)); + // Make sure the operands' sharding are set to the ungrouped ones. + lhs.hlo()->set_sharding(lhs_sharding); + rhs.hlo()->set_sharding(rhs_sharding); + dot->set_sharding(UngroupSharding(output_grouped)); + return PartitionedHlo(dot, output_base_shape, lhs.state()) + .Reshard(output_sharding) + .hlo(); +} + +StatusOr PartitionDotGroupOnNonContracting( + bool lhs_matching, PartitionedHlo matching, PartitionedHlo other, + int64 matching_contracting_partitions, int64 other_contracting_partitions, + int64 matching_non_contracting_partitions, + int64 other_non_contracting_partitions, + int64 output_other_non_contracting_partitions, + const Shape& output_base_shape, const HloSharding& output_sharding, + const DotGeneralDimsMapping& dims_mapping, int64 num_partitions, + const std::function( + HloInstruction*, HloInstruction*, SpmdBuilder*)>& create_sharded_dot, + HloModule* module, HloInstruction* original_hlo, + int64 threshold_for_windowed_einsum_mib, SpmdBuilder* b, + std::vector* + windowed_dot_general_loops) { + const bool may_replicate_other_contracting_dims = + (other_contracting_partitions == matching_non_contracting_partitions && + other_non_contracting_partitions == + output_other_non_contracting_partitions); + const bool may_replicate_other_non_contracting_dims = + matching_non_contracting_partitions == other_non_contracting_partitions && + matching_contracting_partitions == other_contracting_partitions; + std::vector other_group_dims; + if (may_replicate_other_contracting_dims && + (!may_replicate_other_non_contracting_dims || + ShapeUtil::ByteSizeOf(other.base_shape()) <= + ShapeUtil::ByteSizeOf(output_base_shape))) { + for (const auto& dim : dims_mapping.contracting_dims) { + other_group_dims.push_back(lhs_matching ? dim.rhs : dim.lhs); + } + } else if (may_replicate_other_non_contracting_dims) { + for (const auto& dim : lhs_matching + ? dims_mapping.rhs_non_contracting_dims + : dims_mapping.lhs_non_contracting_dims) { + other_group_dims.push_back(lhs_matching ? dim.rhs : dim.lhs); + } + } else if (!other.sharding().IsReplicated()) { + return nullptr; + } + auto matching_sharding_dims = + matching.sharding().tile_assignment().dimensions(); + std::vector matching_dims; + std::vector output_dims; + // Make sure the partitioning on matching's non-contracting dimensions + // defines the same device groups for both matching and output. + for (const auto& dim : lhs_matching ? dims_mapping.lhs_non_contracting_dims + : dims_mapping.rhs_non_contracting_dims) { + int64 md = lhs_matching ? 
dim.lhs : dim.rhs; + matching_sharding_dims[md] = + output_sharding.tile_assignment().dim(dim.output); + matching_dims.push_back(md); + output_dims.push_back(dim.output); + } + auto output_grouped = GroupShardingOnDims(output_sharding, output_dims); + auto reshaped_matching_tiling = matching.sharding().tile_assignment(); + reshaped_matching_tiling.Reshape(matching_sharding_dims); + auto matching_grouped = AlignGroupsWith( + GroupShardingOnDims(HloSharding::Tile(reshaped_matching_tiling), + matching_dims), + output_grouped); + matching = matching.Reshard(UngroupSharding(matching_grouped)); + auto per_group_partitioner_state = CreatePerGroupPartitioningState( + matching.state(), matching_grouped.device_groups, b); + matching.hlo()->set_sharding(matching_grouped.sharding); + auto matching_p = PartitionedHlo( + matching.hlo(), + GetPerGroupBaseShape(matching_grouped, matching.base_shape()), + per_group_partitioner_state); + + auto partially_replicated_other = other.hlo(); + if (!other.sharding().IsReplicated()) { + auto other_grouped = + AlignGroupsWith(GroupShardingOnDims(other.sharding(), other_group_dims), + output_grouped, /*ignore_group_order=*/true); + other = other.Reshard(UngroupSharding(other_grouped)); + partially_replicated_other = + other.ReplicatePartial(other_grouped.group_dims); + partially_replicated_other->set_sharding(other_grouped.sharding); + } + auto other_p = PartitionedHlo(partially_replicated_other, other.base_shape(), + per_group_partitioner_state); + TF_ASSIGN_OR_RETURN( + auto dot, + PartitionDot(lhs_matching ? matching_p : other_p, + lhs_matching ? other_p : matching_p, + GetPerGroupBaseShape(output_grouped, output_base_shape), + output_grouped.sharding, dims_mapping, + num_partitions / matching_grouped.device_groups.size(), + create_sharded_dot, module, original_hlo, + threshold_for_windowed_einsum_mib, b, + windowed_dot_general_loops)); + // Reset matching's sharding to the ungrouped one. + matching.hlo()->set_sharding(UngroupSharding(matching_grouped)); + return dot; +} + +// Recursive partitioning function. If there are partial dimensions matching in +// the operands and output, group the devices and recursively partition the +// in-group dot. +StatusOr PartitionDot( + PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape, + const HloSharding& output_sharding, + const DotGeneralDimsMapping& dims_mapping, int64 num_partitions, + const std::function( + HloInstruction*, HloInstruction*, SpmdBuilder*)>& create_sharded_dot, + HloModule* module, HloInstruction* original_hlo, + int64 threshold_for_windowed_einsum_mib, SpmdBuilder* b, + std::vector* + windowed_dot_general_loops) { + // lhs_rhs_or_output: 0 lhs, 1 rhs, 2 output. 
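As a standalone sketch (not part of the patch; the names are hypothetical), the partition counts computed by the helper lambda defined just below are simply products of tile-assignment extents over the mapped dimensions:

    #include <cstdint>
    #include <vector>

    // Number of partitions along a set of dimensions of a tiled sharding:
    // the product of the tile-assignment extents on those dimensions.
    int64_t PartitionsForDims(const std::vector<int64_t>& tile_assignment_dims,
                              const std::vector<int64_t>& dims) {
      int64_t partitions = 1;
      for (int64_t dim : dims) {
        partitions *= tile_assignment_dims[dim];
      }
      return partitions;
    }
    // E.g. a tile assignment of shape {2, 4, 1} gives 2 partitions over dims
    // {0} and 8 over dims {0, 1}; a tile-maximal (replicated) sharding counts
    // as 1, which is why the lambda below returns early in that case.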
+ auto get_partitions_for_dims = + [&](const HloSharding& sharding, + absl::Span dims, + int lhs_rhs_or_output) { + int64 partitions = 1; + if (sharding.IsTileMaximal()) { + return partitions; + } + for (const auto& dim : dims) { + if (lhs_rhs_or_output == 0) { + partitions *= sharding.tile_assignment().dim(dim.lhs); + } else if (lhs_rhs_or_output == 1) { + partitions *= sharding.tile_assignment().dim(dim.rhs); + } else { + CHECK_EQ(lhs_rhs_or_output, 2); + partitions *= sharding.tile_assignment().dim(dim.output); + } + } + return partitions; + }; + const int64 lhs_batch_partitions = + get_partitions_for_dims(lhs.sharding(), dims_mapping.batch_dims, 0); + const int64 rhs_batch_partitions = + get_partitions_for_dims(rhs.sharding(), dims_mapping.batch_dims, 1); + const int64 output_batch_partitions = + get_partitions_for_dims(output_sharding, dims_mapping.batch_dims, 2); + const int64 lhs_contracting_partitions = + get_partitions_for_dims(lhs.sharding(), dims_mapping.contracting_dims, 0); + const int64 rhs_contracting_partitions = + get_partitions_for_dims(rhs.sharding(), dims_mapping.contracting_dims, 1); + const int64 lhs_non_contracting_partitions = get_partitions_for_dims( + lhs.sharding(), dims_mapping.lhs_non_contracting_dims, 0); + const int64 rhs_non_contracting_partitions = get_partitions_for_dims( + rhs.sharding(), dims_mapping.rhs_non_contracting_dims, 1); + const int64 output_lhs_non_contracting_partitions = get_partitions_for_dims( + output_sharding, dims_mapping.lhs_non_contracting_dims, 2); + const int64 output_rhs_non_contracting_partitions = get_partitions_for_dims( + output_sharding, dims_mapping.rhs_non_contracting_dims, 2); + TF_ASSIGN_OR_RETURN( + auto try_partitioned_dot, + PartitionBaseCase( + lhs, rhs, output_base_shape, output_sharding, dims_mapping, + num_partitions, create_sharded_dot, module, original_hlo, + lhs_batch_partitions, rhs_batch_partitions, output_batch_partitions, + lhs_contracting_partitions, rhs_contracting_partitions, + lhs_non_contracting_partitions, rhs_non_contracting_partitions, + output_lhs_non_contracting_partitions, + output_rhs_non_contracting_partitions, + threshold_for_windowed_einsum_mib, b, windowed_dot_general_loops)); + if (try_partitioned_dot) { + return try_partitioned_dot; + } + + // Recursively partition on different types of dimensions. + // + // Case 1: Group partitions by batch. + if ((lhs_batch_partitions == output_batch_partitions || + rhs_batch_partitions == output_batch_partitions) && + output_batch_partitions > 1) { + TF_ASSIGN_OR_RETURN( + auto dot, + PartitionDotGroupOnBatch( + lhs, rhs, output_base_shape, output_sharding, dims_mapping, + num_partitions, lhs_contracting_partitions, + rhs_contracting_partitions, lhs_non_contracting_partitions, + rhs_non_contracting_partitions, create_sharded_dot, module, + original_hlo, threshold_for_windowed_einsum_mib, b, + windowed_dot_general_loops)); + if (dot) { + return dot; + } + } + + // Case 2: Group partitions by non-contracting dimensions. + const bool may_group_on_lhs_non_contracting = + lhs_non_contracting_partitions == output_lhs_non_contracting_partitions && + lhs_non_contracting_partitions > 1; + const bool may_group_on_rhs_non_contracting = + rhs_non_contracting_partitions == output_rhs_non_contracting_partitions && + rhs_non_contracting_partitions > 1; + if (may_group_on_lhs_non_contracting || may_group_on_rhs_non_contracting) { + // If both match output non-contracting dimensions, choose the one which + // will result in smaller replication of the other operand. 
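A standalone sketch (not part of the patch) of the tie-break applied in the comparison right below when both groupings are possible: prefer the side whose grouping replicates fewer bytes of the other, already sharded, operand.

    #include <cstdint>

    // Group on the LHS non-contracting dimensions when replicating the RHS
    // shard across those groups is no more expensive than the symmetric
    // alternative of grouping on the RHS side.
    bool PreferLhsNonContractingGrouping(int64_t lhs_nc_partitions,
                                         int64_t rhs_shard_bytes,
                                         int64_t rhs_nc_partitions,
                                         int64_t lhs_shard_bytes) {
      return lhs_nc_partitions * rhs_shard_bytes <=
             rhs_nc_partitions * lhs_shard_bytes;
    }
    // E.g. lhs_nc_partitions = 4 with a 1 MiB RHS shard vs. rhs_nc_partitions
    // = 2 with an 8 MiB LHS shard: 4 MiB <= 16 MiB, so group on the LHS side.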
+ const bool lhs_matching = + may_group_on_lhs_non_contracting && + (!may_group_on_rhs_non_contracting || + lhs_non_contracting_partitions * + ShapeUtil::ByteSizeOf(rhs.hlo()->shape()) <= + rhs_non_contracting_partitions * + ShapeUtil::ByteSizeOf(lhs.hlo()->shape())); + + TF_ASSIGN_OR_RETURN( + auto dot, + PartitionDotGroupOnNonContracting( + lhs_matching, lhs_matching ? lhs : rhs, lhs_matching ? rhs : lhs, + lhs_matching ? lhs_contracting_partitions + : rhs_contracting_partitions, + lhs_matching ? rhs_contracting_partitions + : lhs_contracting_partitions, + lhs_matching ? lhs_non_contracting_partitions + : rhs_non_contracting_partitions, + lhs_matching ? rhs_non_contracting_partitions + : lhs_non_contracting_partitions, + lhs_matching ? output_rhs_non_contracting_partitions + : output_lhs_non_contracting_partitions, + output_base_shape, output_sharding, dims_mapping, num_partitions, + create_sharded_dot, module, original_hlo, + threshold_for_windowed_einsum_mib, b, windowed_dot_general_loops)); + if (dot) { + return dot; + } + } + + // Default action. + TF_ASSIGN_OR_RETURN(auto dot, create_sharded_dot(lhs.Replicate().hlo(), + rhs.Replicate().hlo(), b)); + dot->set_sharding(HloSharding::Replicate()); + return PartitionedHlo(dot, output_base_shape, lhs.state()) + .Reshard(output_sharding) + .hlo(); +} + +} // namespace + +Status SpmdPartitioningVisitor::HandleDotHelper( + HloInstruction* hlo, const DotGeneralDimsMapping& dims_mapping, + const std::function( + HloInstruction*, HloInstruction*, SpmdBuilder*)>& create_sharded_dot) { + auto& lhs = GetPartitionedHlo(hlo->operand(0)); + auto& rhs = GetPartitionedHlo(hlo->operand(1)); + TF_ASSIGN_OR_RETURN( + auto partitioned_dot, + PartitionDot(lhs, rhs, hlo->shape(), hlo->sharding(), dims_mapping, + num_partitions_, create_sharded_dot, module_, hlo, + options_.threshold_for_windowed_einsum_mib, &b_, + &windowed_dot_general_loops_)); + SetPartitionedHlo(hlo, [&] { return partitioned_dot; }); + return Status::OK(); +} + +namespace { + +// Finds a cluster of nodes that produce the inputs for `hlo` which only depend +// on small operands, which means the cluster should start with broadcasts, +// constants and iotas. All other internal nodes must be non-side-effecting +// elemntwise ops. Returns the set of nodes, and the small operands. E.g., for +// the following graph, +// +// a -> broadcast -> multiply +// iota ---> add--/ +// constant/ +// +// FindInputNodesIfOnlyDependOnSmallOperands(multiply) will return +// <{broadcast, iota, constant, add, multiply}, [a]>. 
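A simplified standalone sketch (not part of the patch; the Node type is hypothetical) of the traversal the function below performs: leaf-like nodes (broadcast/constant/iota) are accepted and their operands become the returned small operands, elementwise nodes over accepted inputs are accepted as well, and anything else means the cluster cannot be sunk, so the result is empty.

    #include <unordered_set>
    #include <utility>
    #include <vector>

    struct Node {
      enum Kind { kLeafLike, kElementwise, kOther } kind;
      std::vector<Node*> operands;
    };

    std::pair<std::unordered_set<Node*>, std::vector<Node*>> FindCluster(
        Node* root) {
      std::unordered_set<Node*> cluster;
      std::unordered_set<Node*> operand_set;   // Deduplicates small operands.
      std::vector<Node*> small_operands;
      std::vector<Node*> worklist = {root};
      while (!worklist.empty()) {
        Node* node = worklist.back();
        worklist.pop_back();
        if (cluster.count(node) > 0) continue;
        if (node->kind == Node::kLeafLike) {
          cluster.insert(node);
          for (Node* operand : node->operands) {
            if (operand_set.insert(operand).second) {
              small_operands.push_back(operand);
            }
          }
        } else if (node->kind == Node::kElementwise) {
          cluster.insert(node);
          for (Node* operand : node->operands) worklist.push_back(operand);
        } else {
          return {{}, {}};  // Not a sinkable cluster.
        }
      }
      return {std::move(cluster), std::move(small_operands)};
    }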
+std::pair, std::vector> +FindInputNodesIfOnlyDependOnSmallOperands(HloInstruction* hlo) { + absl::flat_hash_set nodes_found; + std::vector new_operands; + absl::flat_hash_set new_operands_set; + std::vector worklist; + worklist.push_back(hlo); + while (!worklist.empty()) { + auto inst = worklist.back(); + worklist.pop_back(); + if (nodes_found.count(inst) > 0) { + continue; + } + if (inst->opcode() == HloOpcode::kBroadcast || + inst->opcode() == HloOpcode::kConstant || + inst->opcode() == HloOpcode::kIota) { + nodes_found.insert(inst); + for (auto o : inst->operands()) { + auto res = new_operands_set.emplace(o); + if (res.second) { + new_operands.push_back(o); + } + } + } else if (inst->IsElementwise() && !inst->HasSideEffectNoRecurse() && + inst->opcode() != HloOpcode::kAllReduce && + absl::c_all_of(inst->operands(), + [inst](const HloInstruction* o) { + return ShapeUtil::CompatibleIgnoringElementType( + o->shape(), inst->shape()); + })) { + nodes_found.insert(inst); + for (auto o : inst->operands()) { + worklist.push_back(o); + } + } else { + nodes_found.clear(); + new_operands.clear(); + break; + } + } + return {std::move(nodes_found), std::move(new_operands)}; +} + +// Moves a cluster of memory-reducing nodes into the windowed dot-general loop +// on contracting dimensions. Such a loop has a dynamic slice on the +// non-windowed operand. If we move the input nodes into the loop, the +// dynamic-slice could be merged with them by later optimization passes, which +// reduces memory. +// +// small_operands small_operands +// | | +// input_nodes loop { | +// | => input_nodes +// loop { | | +// dynamic-slice dynamic-slice +// ... ... +// } } +// +// Later optimization passes (TpuPadSliceMover) will merge the dynamic slice +// with the input nodes. +Status SinkInputNodesIntoWindowedDotGeneralLoopOnContractingDimensions( + HloInstruction* loop, int64 non_windowed_operand_index) { + auto input_tuple = loop->mutable_operand(0); + auto old_operand = input_tuple->mutable_operand(non_windowed_operand_index); + auto input_nodes = FindInputNodesIfOnlyDependOnSmallOperands(old_operand); + auto to_sink = std::move(input_nodes.first); + auto new_operands = std::move(input_nodes.second); + if (to_sink.empty()) { + return Status::OK(); + } + auto computation = loop->parent(); + // Replace the old operand with a tuple of the found small operands. + auto new_input_subtuple = + computation->AddInstruction(HloInstruction::CreateTuple(new_operands)); + TF_RETURN_IF_ERROR(input_tuple->ReplaceOperandWithDifferentShape( + non_windowed_operand_index, new_input_subtuple)); + + auto body = loop->while_body(); + auto body_param = body->parameter_instruction(0); + auto old_body_param_users = body_param->users(); + // Update all tuple shapes. + for (auto tuple : std::vector{ + input_tuple, loop, loop->while_condition()->parameter_instruction(0), + body_param, body->root_instruction()}) { + *ShapeUtil::GetMutableSubshape(tuple->mutable_shape(), + {non_windowed_operand_index}) = + new_input_subtuple->shape(); + } + // Now update the loop body. + auto new_operand_tuple_inside = + body->AddInstruction(HloInstruction::CreateGetTupleElement( + new_input_subtuple->shape(), body_param, non_windowed_operand_index)); + TF_RETURN_IF_ERROR(body->root_instruction()->ReplaceOperandWithDifferentShape( + non_windowed_operand_index, new_operand_tuple_inside)); + + // Create nodes inside the loop body. 
+ std::vector worklist; + absl::flat_hash_map outside_to_inside; + auto add_users_if_available = [&](HloInstruction* inst) { + for (auto u : inst->users()) { + if (outside_to_inside.count(u) == 0 && to_sink.count(u) > 0 && + absl::c_all_of(u->operands(), [&](const HloInstruction* o) { + return outside_to_inside.count(o) > 0; + })) { + worklist.push_back(u); + } + } + }; + for (int64 i = 0; i < new_operands.size(); ++i) { + outside_to_inside[new_operands[i]] = + body->AddInstruction(HloInstruction::CreateGetTupleElement( + new_operands[i]->shape(), new_operand_tuple_inside, i)); + add_users_if_available(new_operands[i]); + } + // HLOs to sink without operands. + std::vector nullaries_to_sink; + for (auto inst : to_sink) { + if (inst->operand_count() == 0) { + nullaries_to_sink.push_back(inst); + } + } + // Sort nullaries_to_sink to make it deterministic. + absl::c_sort(nullaries_to_sink, + [](const HloInstruction* a, const HloInstruction* b) { + return a->unique_id() < b->unique_id(); + }); + worklist.reserve(nullaries_to_sink.size()); + for (auto inst : nullaries_to_sink) { + worklist.push_back(inst); + } + while (!worklist.empty()) { + auto inst = worklist.back(); + worklist.pop_back(); + std::vector inst_new_operands(inst->operand_count()); + for (int64 i = 0; i < inst->operand_count(); ++i) { + inst_new_operands[i] = outside_to_inside[inst->operand(i)]; + } + outside_to_inside[inst] = body->AddInstruction( + inst->CloneWithNewOperands(inst->shape(), inst_new_operands)); + add_users_if_available(inst); + } + TF_RET_CHECK(outside_to_inside.count(old_operand) > 0); + for (auto ou : old_body_param_users) { + if (ou->opcode() == HloOpcode::kGetTupleElement && + ou->tuple_index() == non_windowed_operand_index) { + TF_RETURN_IF_ERROR( + ou->ReplaceAllUsesWith(outside_to_inside[old_operand])); + TF_RETURN_IF_ERROR(body->RemoveInstruction(ou)); + } + } + return Status::OK(); +} + +// Moves a cluster of memory-reducing nodes (with reduce nodes at the end) into +// the windowed dot-general loop on non-contracting dimensions. Such a loop has +// a dynamic-update-slice at the output. If we move the user nodes into the loop +// and before the dynamic-update-slice, the user nodes can operate on smaller +// shapes, which reduces memory. +// +// small_operands small_operands +// | | => | | +// | | loop { loop { | | +// | | conv | broadcast conv +// | | | | | / +// | | dynamic-update-slice | dynamic-slice / +// | | | | | / +// | | } | | multiply----- +// |broadcast / | / +// | | / reduce +// |multiply-- | +// \ | dynamic-update-slice +// reduce } +// +// Later optimization passes (TpuPadSliceMover) will merge the dynamic slice +// with the input nodes (broadcast). +Status MoveUsersIntoWindowedDotGeneralLoopOnNonContractingDimensions( + HloInstruction* loop) { + CHECK_EQ(loop->user_count(), 1); + // There should be a single direct user of the while loop, which is the + // gte for element 2, i.e., the dot output. + auto user_gte = loop->users().front(); + CHECK_EQ(user_gte->opcode(), HloOpcode::kGetTupleElement); + CHECK_EQ(user_gte->tuple_index(), 2); + auto computation = loop->parent(); + + // Find the reduce outputs and the input nodes they depend on, if input nodes + // only have small operands. 
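The sinking code above only clones an instruction once every operand already has an inside-the-loop clone, seeding the worklist with the tuple-element reads of the small operands and with the deterministically sorted nullary ops. The same ordering rule as a standalone sketch over a toy graph type (Node and EmitInDependencyOrder are illustrative; seeds are assumed to be ready, i.e. operand-free or fed entirely from outside the cluster):

#include <algorithm>
#include <unordered_set>
#include <vector>

struct Node {
  std::vector<Node*> operands;
  std::vector<Node*> users;
};

// Emits cluster nodes in operand-before-user order using a worklist.
std::vector<Node*> EmitInDependencyOrder(
    const std::unordered_set<Node*>& cluster, std::vector<Node*> worklist) {
  std::vector<Node*> order;
  std::unordered_set<Node*> emitted;
  auto ready = [&](Node* n) {
    return std::all_of(n->operands.begin(), n->operands.end(), [&](Node* o) {
      return emitted.count(o) > 0 || cluster.count(o) == 0;
    });
  };
  while (!worklist.empty()) {
    Node* n = worklist.back();
    worklist.pop_back();
    if (emitted.count(n) > 0) continue;
    emitted.insert(n);
    order.push_back(n);  // clone point: operands are emitted or external
    for (Node* u : n->users) {
      if (cluster.count(u) > 0 && emitted.count(u) == 0 && ready(u)) {
        worklist.push_back(u);
      }
    }
  }
  return order;
}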
+ absl::flat_hash_set to_move; + std::vector new_operands; + absl::flat_hash_set new_operands_set; + std::vector reduce_outputs; + std::vector worklist; + Shape padded_shape = user_gte->shape(); + Shape unpadded_shape = user_gte->shape(); + auto original_output = user_gte; + + if (user_gte->user_count() == 1 && + user_gte->users().back()->opcode() == HloOpcode::kSlice) { + original_output = user_gte->users().back(); + unpadded_shape = original_output->shape(); + } + for (auto u : original_output->users()) { + worklist.push_back(u); + } + to_move.insert(original_output); + while (!worklist.empty()) { + auto inst = worklist.back(); + worklist.pop_back(); + if (to_move.count(inst) > 0) { + continue; + } + // We only support reduces with a simple reduction function, since we may need + // to accumulate across iterations manually. + if (inst->opcode() == HloOpcode::kReduce && + inst->to_apply()->instruction_count() == 3 && + inst->to_apply()->num_parameters() == 2 && + inst->to_apply()->root_instruction()->IsElementwise()) { + to_move.insert(inst); + auto other_operand = inst->mutable_operand(1); + auto res = new_operands_set.emplace(other_operand); + if (res.second) { + new_operands.push_back(other_operand); + } + reduce_outputs.push_back(inst); + } else if (inst != computation->root_instruction() && + inst->user_count() > 0 && inst->IsElementwise() && + !inst->HasSideEffectNoRecurse() && + inst->opcode() != HloOpcode::kAllReduce && + absl::c_all_of(inst->operands(), + [inst](const HloInstruction* o) { + return ShapeUtil::CompatibleIgnoringElementType( + o->shape(), inst->shape()); + })) { + // For an elementwise op, we need to make sure that it depends only on + // nodes already in to_move and nodes with small operands. + bool can_include = true; + for (auto operand : inst->operands()) { + if (to_move.count(operand) > 0) { + continue; + } + auto find_result = FindInputNodesIfOnlyDependOnSmallOperands(operand); + if (find_result.first.empty()) { + can_include = false; + break; + } + for (auto n : find_result.first) { + to_move.insert(n); + } + for (auto new_operand : find_result.second) { + auto res = new_operands_set.insert(new_operand); + if (res.second) { + new_operands.push_back(new_operand); + } + } + } + if (!can_include) { + to_move.clear(); + break; + } + to_move.insert(inst); + for (auto u : inst->users()) { + worklist.push_back(u); + } + } else { + to_move.clear(); + break; + } + } + // If nothing is found, to_move could contain only original_output, or have been cleared + // by the above code. + if (to_move.size() <= 1) { + return Status::OK(); + } + + // We will replace the original loop output with reduce-shape outputs. Create + // the initial buffers before the loop. + for (auto out : reduce_outputs) { + auto padded_out_shape = out->shape(); + int64 operand_dim = 0; + int64 output_dim = 0; + while (output_dim < padded_out_shape.rank()) { + if (absl::c_linear_search(out->dimensions(), operand_dim)) { + // Dimension collapsed. + ++operand_dim; + continue; + } + // Kept dimensions have the same size as the padded shape.
+ padded_out_shape.set_dimensions(output_dim, + padded_shape.dimensions(operand_dim)); + ++operand_dim; + ++output_dim; + } + auto broadcast = + computation->AddInstruction(HloInstruction::CreateBroadcast( + padded_out_shape, + computation->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(out->shape().element_type()))), + {})); + new_operands.push_back(broadcast); + } + + auto input_tuple = loop->mutable_operand(0); + // Create the new input subtuple that contains the small operands and the + // reduce-shape result buffers. + auto new_input_subtuple = + computation->AddInstruction(HloInstruction::CreateTuple(new_operands)); + TF_RETURN_IF_ERROR( + input_tuple->ReplaceOperandWithDifferentShape(2, new_input_subtuple)); + auto body = loop->while_body(); + auto body_param = body->parameter_instruction(0); + auto body_root = body->root_instruction(); + CHECK_EQ(body_root->opcode(), HloOpcode::kTuple); + // Update tuple shapes. + for (auto tuple : std::vector{ + input_tuple, loop, loop->while_condition()->parameter_instruction(0), + body_param, body_root}) { + *ShapeUtil::GetMutableSubshape(tuple->mutable_shape(), {2}) = + new_input_subtuple->shape(); + } + auto new_loop_input = + body->AddInstruction(HloInstruction::CreateGetTupleElement( + new_input_subtuple->shape(), body_param, 2)); + + // Now create the moved nodes inside the loop body. + absl::flat_hash_map outside_to_inside; + worklist.clear(); + auto add_users_if_available = [&](HloInstruction* inst) { + for (auto u : inst->users()) { + if (outside_to_inside.count(u) == 0 && to_move.count(u) > 0 && + absl::c_all_of(u->operands(), [&](const HloInstruction* o) { + return outside_to_inside.count(o) > 0; + })) { + worklist.push_back(u); + } + } + }; + for (int64 i = 0; i < new_operands.size(); ++i) { + outside_to_inside[new_operands[i]] = + body->AddInstruction(HloInstruction::CreateGetTupleElement( + new_operands[i]->shape(), new_loop_input, i)); + add_users_if_available(new_operands[i]); + } + // The elementwise nodes will be created with sliced shape. The original loop + // output corresponds to the dynamic-update-slice's update slice. + auto dus = body_root->mutable_operand(2); + CHECK_EQ(dus->opcode(), HloOpcode::kDynamicUpdateSlice); + outside_to_inside[original_output] = dus->mutable_operand(1); + add_users_if_available(original_output); + std::vector slice_offsets(padded_shape.rank()); + for (int64 i = 0; i < slice_offsets.size(); ++i) { + slice_offsets[i] = dus->mutable_operand(i + 2); + } + auto get_slice = [&](HloInstruction* padded) { + return body->AddInstruction(HloInstruction::CreateDynamicSlice( + ShapeUtil::ChangeElementType(dus->operand(1)->shape(), + padded->shape().element_type()), + padded, slice_offsets, dus->operand(1)->shape().dimensions())); + }; + // Helper functions to create nodes with small operands. 
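The helper lambdas that follow (add_broadcast, add_iota, add_constant) share one trick: materialize the small operand at the padded full shape, then take the same dynamic slice as the loop's current update window, so each iteration sees a correctly positioned piece. A 1-D toy version of that pad-then-slice pattern (illustrative only; a scalar operand and contiguous offsets are assumed):

#include <cstdint>
#include <vector>

std::vector<float> SliceOfPaddedBroadcast(float scalar, int64_t padded_size,
                                          int64_t offset, int64_t slice_size) {
  std::vector<float> padded(padded_size, scalar);  // broadcast to padded shape
  return std::vector<float>(padded.begin() + offset,
                            padded.begin() + offset + slice_size);  // dynamic-slice
}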
+ auto add_broadcast = [&](const HloInstruction* broadcast) { + auto padded_operand_shape = broadcast->operand(0)->shape(); + for (int64 i = 0; i < broadcast->dimensions().size(); ++i) { + padded_operand_shape.set_dimensions( + i, padded_shape.dimensions(broadcast->dimensions(i))); + } + auto padded_operand = PadToShape(outside_to_inside[broadcast->operand(0)], + padded_operand_shape, nullptr, body); + outside_to_inside[broadcast] = + get_slice(body->AddInstruction(broadcast->CloneWithNewOperands( + ShapeUtil::ChangeElementType(padded_shape, + padded_operand_shape.element_type()), + {padded_operand}))); + }; + auto add_iota = [&](const HloInstruction* iota) { + outside_to_inside[iota] = + get_slice(body->AddInstruction(iota->CloneWithNewOperands( + ShapeUtil::ChangeElementType(padded_shape, + iota->shape().element_type()), + {}))); + }; + auto add_constant = [&](const HloInstruction* constant) { + outside_to_inside[constant] = body->AddInstruction(constant->Clone()); + outside_to_inside[constant] = get_slice( + PadToShape(outside_to_inside[constant], + ShapeUtil::ChangeElementType( + padded_shape, constant->shape().element_type()), + nullptr, body)); + }; + while (!worklist.empty()) { + auto inst = worklist.back(); + worklist.pop_back(); + if (outside_to_inside.count(inst) > 0) { + continue; + } + if (inst->opcode() == HloOpcode::kBroadcast) { + add_broadcast(inst); + } else if (inst->opcode() == HloOpcode::kIota) { + add_iota(inst); + } else if (inst->opcode() == HloOpcode::kConstant) { + add_constant(inst); + } else if (inst->opcode() == HloOpcode::kReduce) { + // This is an output, for which we have special handling later. + } else { + std::vector operands_inside(inst->operand_count()); + for (int64 i = 0; i < operands_inside.size(); ++i) { + operands_inside[i] = outside_to_inside[inst->operand(i)]; + } + outside_to_inside[inst] = body->AddInstruction(inst->CloneWithNewOperands( + ShapeUtil::ChangeElementType(dus->operand(1)->shape(), + inst->shape().element_type()), + operands_inside)); + } + add_users_if_available(inst); + } + std::vector new_outputs_inside(new_operands.size()); + for (int64 i = 0; i < new_outputs_inside.size(); ++i) { + new_outputs_inside[i] = outside_to_inside[new_operands[i]]; + } + // Now create the reduce outputs inside the loop. + for (int64 i = 0; i < reduce_outputs.size(); ++i) { + auto reduce_outside = reduce_outputs[i]; + CHECK_EQ(reduce_outside->opcode(), HloOpcode::kReduce); + int64 index_in_operand = new_operands.size() - reduce_outputs.size() + i; + auto last_iter_result = outside_to_inside[new_operands[index_in_operand]]; + auto operand0 = outside_to_inside[reduce_outside->operand(0)]; + auto operand1 = outside_to_inside[reduce_outside->operand(1)]; + TF_ASSIGN_OR_RETURN(auto reduce_shape, + ShapeInference::InferReduceShape( + {&operand0->shape(), &operand1->shape()}, + reduce_outside->dimensions(), + reduce_outside->to_apply()->ComputeProgramShape())); + *reduce_shape.mutable_layout() = reduce_outside->shape().layout(); + std::vector reduce_dus_offsets; + // If any collapsed dimension is windowed, we need to accumulate with last + // iteration's result. If such a dimension has padding, we also need to mask + // off invalid data.
+ bool needs_accumulate = false; + std::vector dims_to_mask; + for (int64 i = 0; i < slice_offsets.size(); ++i) { + if (absl::c_linear_search(reduce_outside->dimensions(), i)) { + if (reduce_outside->operand(0)->shape().dimensions(i) != + operand0->shape().dimensions(i)) { + needs_accumulate = true; + if (unpadded_shape.dimensions(i) != padded_shape.dimensions(i)) { + dims_to_mask.push_back(i); + } + } + continue; + } + reduce_dus_offsets.push_back(slice_offsets[i]); + } + // Mask off invalid data in collapsed dimensions. + for (int64 dim : dims_to_mask) { + auto iota = body->AddInstruction(HloInstruction::CreateIota( + ShapeUtil::ChangeElementType(operand0->shape(), S32), dim)); + auto add = body->AddInstruction(HloInstruction::CreateBinary( + iota->shape(), HloOpcode::kAdd, iota, + body->AddInstruction(HloInstruction::CreateBroadcast( + iota->shape(), slice_offsets[dim], {})))); + auto limit = body->AddInstruction(HloInstruction::CreateBroadcast( + iota->shape(), + body->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0( + reduce_outside->operand(0)->shape().dimensions(dim)))), + {})); + auto compare = body->AddInstruction(HloInstruction::CreateCompare( + ShapeUtil::ChangeElementType(iota->shape(), PRED), add, limit, + ComparisonDirection::kLt)); + operand0 = body->AddInstruction(HloInstruction::CreateTernary( + operand0->shape(), HloOpcode::kSelect, compare, operand0, + body->AddInstruction(HloInstruction::CreateBroadcast( + operand0->shape(), operand1, {})))); + } + auto output_inside = + body->AddInstruction(reduce_outside->CloneWithNewOperands( + reduce_shape, {operand0, operand1})); + // Accumulate with previous results if needed. + if (needs_accumulate) { + auto input_slice = + body->AddInstruction(HloInstruction::CreateDynamicSlice( + output_inside->shape(), last_iter_result, reduce_dus_offsets, + output_inside->shape().dimensions())); + output_inside = body->AddInstruction(HloInstruction::CreateBinary( + output_inside->shape(), + reduce_outside->to_apply()->root_instruction()->opcode(), + output_inside, input_slice)); + } + // Dynamic-update-slice if needed. + if (!ShapeUtil::Compatible(output_inside->shape(), + last_iter_result->shape())) { + output_inside = + body->AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + last_iter_result->shape(), last_iter_result, output_inside, + reduce_dus_offsets)); + } + new_outputs_inside[index_in_operand] = output_inside; + } + // Body output. + auto new_output_inside = + body->AddInstruction(HloInstruction::CreateTuple(new_outputs_inside)); + TF_RETURN_IF_ERROR( + body_root->ReplaceOperandWithDifferentShape(2, new_output_inside)); + TF_RETURN_IF_ERROR(body->RemoveInstructionAndUnusedOperands(dus)); + // Replace uses of the reduces outside the loop. 
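The iota/add/compare/select sequence above implements a simple predicate: a position survives only if its global index (shard offset plus local index) lies inside the valid, unpadded extent; otherwise it is overwritten with the reduction's identity so padding cannot perturb the accumulated value. An element-wise analogue on plain vectors (MaskPaddedTail is an illustrative name, not the patch's API):

#include <cstdint>
#include <vector>

std::vector<float> MaskPaddedTail(const std::vector<float>& shard,
                                  int64_t shard_offset, int64_t valid_size,
                                  float identity) {
  std::vector<float> out(shard.size());
  for (int64_t i = 0; i < static_cast<int64_t>(shard.size()); ++i) {
    // Keep in-bounds data, neutralize padded positions with the identity.
    out[i] = (shard_offset + i < valid_size) ? shard[i] : identity;
  }
  return out;
}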
+ auto new_output_gte = + computation->AddInstruction(HloInstruction::CreateGetTupleElement( + new_output_inside->shape(), loop, 2)); + for (int64 i = 0; i < reduce_outputs.size(); ++i) { + int64 index_in_operand = new_operands.size() - reduce_outputs.size() + i; + auto new_output = + computation->AddInstruction(HloInstruction::CreateGetTupleElement( + new_outputs_inside[index_in_operand]->shape(), new_output_gte, + index_in_operand)); + if (!ShapeUtil::Compatible(new_output->shape(), + reduce_outputs[i]->shape())) { + new_output = computation->AddInstruction(HloInstruction::CreateSlice( + reduce_outputs[i]->shape(), new_output, + std::vector(new_output->shape().rank(), 0), + reduce_outputs[i]->shape().dimensions(), + std::vector(new_output->shape().rank(), 1))); + } + TF_RETURN_IF_ERROR(reduce_outputs[i]->ReplaceAllUsesWith(new_output)); + TF_RETURN_IF_ERROR( + computation->RemoveInstructionAndUnusedOperands(reduce_outputs[i])); + } + return Status::OK(); +} + +} // namespace + +Status SpmdPartitioningVisitor::DoCodeMotionForWindowedDotGeneralLoops( + HloComputation* computation) { + for (auto& loop : windowed_dot_general_loops_) { + if (loop.windowed_in_contracting_dims || loop.windowed_in_batch_dims) { + // We have a dynamic-slice for the non-windowed operand in + // batch/contracting-dim windowed dot-general. So moving the + // broadcast/iota/elementwise ops into the loop could help reduce memory + // via fusion. + TF_RETURN_IF_ERROR( + SinkInputNodesIntoWindowedDotGeneralLoopOnContractingDimensions( + loop.while_loop, 1 - loop.windowed_operand)); + } + if (!loop.windowed_in_contracting_dims) { + // We have a dynamic-update-slice for the output in + // batch/non-contracting-dim windowed dot-general. So moving reduce ops + // into the loop could help reduce memory. + TF_RETURN_IF_ERROR( + MoveUsersIntoWindowedDotGeneralLoopOnNonContractingDimensions( + loop.while_loop)); + } + } + return Status::OK(); +} + +} // namespace spmd +} // namespace xla diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc index 7e136be54e6..8006e47d90d 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc @@ -31,7 +31,6 @@ limitations under the License. #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/protobuf_util.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" -#include "tensorflow/compiler/xla/service/dot_as_convolution_util.h" #include "tensorflow/compiler/xla/service/flatten_call_graph.h" #include "tensorflow/compiler/xla/service/hlo_casting_utils.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" @@ -166,28 +165,6 @@ template namespace { -// Returns the replica group configuration where each replica belongs to its own -// group. 
-std::vector CreateReplicaGroups(int64 num_replicas) { - std::vector groups(num_replicas); - for (int64 i = 0; i < num_replicas; ++i) { - groups[i].add_replica_ids(i); - } - return groups; -} - -bool CanReshardWithAllToAll(const HloSharding& source, - const HloSharding& target) { - return UniqueTiledDim(source) && UniqueTiledDim(target) && - UniqueTiledDim(source) != UniqueTiledDim(target); -} - -bool CanReshardWithCollectivePermute(const HloSharding& source, - const HloSharding& target) { - return UniqueTiledDim(source) && UniqueTiledDim(target) && - UniqueTiledDim(source) == UniqueTiledDim(target) && source != target; -} - // Clears all sharding attributes from instructions in the module. This must be // called only after all SPMD transformation is complete. Status ClearShardingAttributes(HloModule* module) { @@ -208,6 +185,28 @@ Status ClearShardingAttributes(HloModule* module) { return Status::OK(); } +std::vector> GetPartitionGroupsForReplication( + const HloSharding& sharding, absl::Span replication_dims) { + int64 group_size = 1; + for (int64 i : replication_dims) { + group_size *= sharding.tile_assignment().dim(i); + } + std::vector> partition_groups( + sharding.tile_assignment().num_elements() / group_size); + sharding.tile_assignment().Each( + [&](absl::Span indices, int64 partition) { + int64 group_id = 0; + for (int64 i = 0; i < indices.size(); ++i) { + if (!absl::c_linear_search(replication_dims, i)) { + group_id *= sharding.tile_assignment().dim(i); + group_id += indices[i]; + } + } + partition_groups[group_id].push_back(partition); + }); + return partition_groups; +} + } // namespace HloInstruction* SpmdBuilder::AddInstruction( @@ -278,8 +277,80 @@ PartitionedHlo PartitionedHlo::ReshardNoCache(const HloSharding& target) { return ReshardWithCollectivePermute(target); } - if (CanReshardWithAllToAll(sharding(), target)) { - return ReshardWithAllToAll(target); + if (auto src_tgt_dims = + GetReshardAllToAllSourceTargetDims(sharding(), target)) { + return ReshardWithAllToAll(target, *src_tgt_dims); + } + + // Partial replicated to tiled. + if (sharding().ReplicateOnLastTileDim() && !target.ReplicateOnLastTileDim() && + !target.IsTileMaximal()) { + // Get the temp sharding target from partial replicate to target tile dims. + // target_compatible_sharding has the same tile_assignment dimensions + // as the target and can reshard to target by collective permute. + // target_compatible_sharding could have different device assignment as + // targe. sharding() can reshard to target_compatible_sharding by + // dynamic slice. + auto target_compatible_sharding = PartialReplicateToTileCompatibleSharding( + sharding(), target.tile_assignment().dimensions()); + // Reshard to target_compatible_sharding by dynamic slice. + if (target_compatible_sharding.has_value()) { + std::vector expand_tile_dims; + std::vector tiling_dim_factors; + int64 rank = shape.rank(); + tiling_dim_factors.reserve(rank); + auto temp_target_sharding = target_compatible_sharding.value(); + for (int64 dim = 0; dim < rank; dim++) { + if (temp_target_sharding.tile_assignment().dim(dim) > + sharding().tile_assignment().dim(dim)) { + expand_tile_dims.push_back(dim); + } + tiling_dim_factors.emplace_back( + temp_target_sharding.tile_assignment().dim(dim) / + sharding().tile_assignment().dim(dim)); + } + + // Get per_group partitioner state. 
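GetPartitionGroupsForReplication above assigns each partition a group id by flattening its tile index in mixed-radix order while skipping the dimensions being replicated, so partitions that differ only along those dimensions land in the same group. The id computation in isolation, on plain vectors (GroupIdForTileIndex and its inputs are illustrative names; the per-group partitioner state set up below relies on the same grouping idea):

#include <algorithm>
#include <cstdint>
#include <vector>

int64_t GroupIdForTileIndex(const std::vector<int64_t>& tile_dims,
                            const std::vector<int64_t>& index,
                            const std::vector<int64_t>& replication_dims) {
  int64_t group_id = 0;
  for (int64_t i = 0; i < static_cast<int64_t>(index.size()); ++i) {
    if (std::find(replication_dims.begin(), replication_dims.end(), i) !=
        replication_dims.end()) {
      continue;  // partitions differing only along these dims share a group
    }
    group_id = group_id * tile_dims[i] + index[i];
  }
  return group_id;
}

For example, with tile dims {2, 3} and replication over dim 1, the indices {0,0}, {0,1}, {0,2} all map to group 0 and the {1,*} indices to group 1.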
+ std::vector group_dims( + sharding().tile_assignment().num_dimensions() - 1); + std::iota(group_dims.begin(), group_dims.end(), 0); + auto sharding_grouped = GroupShardingOnDims(sharding(), group_dims); + auto per_group_partitioner_state = CreatePerGroupPartitioningState( + state_, sharding_grouped.device_groups, state_.b); + // 2. Get the padded_hlo, do right halo exchange if needed. + auto padded_hlo = PadFromPartialReplicateShape( + hlo_, base_shape_, sharding(), temp_target_sharding, expand_tile_dims, + state_.collective_ops_creator, state_.next_channel_id, + state_.partition_id, state_.b); + if (padded_hlo.has_value()) { + // 3. Slice out the tile from replicate ones. + auto shard_shape = + MakePartitionedShape(base_shape_, temp_target_sharding); + // device assignment within each group is sorted in + // HloSharding::PartialTile, thus partiton_id within each group can be + // matched with the order in tile_assignment. + Array tiling_assignment(tiling_dim_factors); + tiling_assignment.FillIota(0); + auto slice = + state_.b->AddInstruction(HloInstruction::CreateDynamicSlice( + shard_shape, padded_hlo.value(), + MakePartitionOffsets(padded_hlo.value()->shape(), + HloSharding::Tile(tiling_assignment), + per_group_partitioner_state.partition_id, + per_group_partitioner_state.b), + shard_shape.dimensions())); + slice->set_sharding(temp_target_sharding); + auto result = PartitionedHlo(slice, base_shape_, state_); + // If temp_target_sharding's device assignment is different from target, + // use collective permute to reshard. + if (CanReshardWithCollectivePermute(temp_target_sharding, target)) { + return result.ReshardWithCollectivePermute(target); + } + // If device assignment in temp_target_sharding and target are the same, + // return result directly. + return result; + } + } } // If not replicated yet, first replicate and then reshard to use one of the @@ -296,6 +367,19 @@ PartitionedHlo PartitionedHlo::ReshardNoCache(const HloSharding& target) { return PartitionedHlo(copy, base_shape_, state_); } + // 'Replicated' to partial replicated. + if (target.ReplicateOnLastTileDim()) { + std::vector group_dims(target.tile_assignment().num_dimensions() - + 1); + std::iota(group_dims.begin(), group_dims.end(), 0); + auto target_grouped = GroupShardingOnDims(target, group_dims); + auto partially_sharded = PerGroupSliceFromReplicated( + hlo_, state_.partition_id, target_grouped.device_groups, group_dims, + target_grouped.group_dim_sizes, state_.b); + partially_sharded->set_sharding(target); + return PartitionedHlo(partially_sharded, base_shape(), state_); + } + // 'Replicated' to 'Tiled'. auto padded_hlo = PadBaseShapeBeforeUnevenTiledSharding(hlo_, target, state_.b); @@ -676,42 +760,57 @@ PartitionedHlo PartitionedHlo::Replicate() { } // 'Tiled' to 'Replicated'. 
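The Replicate()/ReplicatePartial() rewrite in the next hunk keeps the familiar fallback for when no all-gather primitive is available: dynamic-update-slice the local shard into a zero buffer of the full padded size, then all-reduce with addition. A small simulation of why that reconstructs the full array (plain vectors stand in for HLOs; equal shard sizes and contiguous offsets are assumed):

#include <algorithm>
#include <cstddef>
#include <vector>

std::vector<float> AllGatherViaSumOfZeros(
    const std::vector<std::vector<float>>& shards) {
  const size_t shard_size = shards.empty() ? 0 : shards[0].size();
  std::vector<float> result(shard_size * shards.size(), 0.0f);
  for (size_t p = 0; p < shards.size(); ++p) {
    std::vector<float> padded(result.size(), 0.0f);  // zero broadcast
    std::copy(shards[p].begin(), shards[p].end(),
              padded.begin() + p * shard_size);      // dynamic-update-slice
    for (size_t i = 0; i < result.size(); ++i) {
      result[i] += padded[i];                        // all-reduce (sum)
    }
  }
  return result;
}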
+ std::vector all_dims(shape.rank()); + std::iota(all_dims.begin(), all_dims.end(), 0); + HloInstruction* result = ReplicatePartial(all_dims); + result->set_sharding(HloSharding::Replicate()); + return update_cache(PartitionedHlo(result, base_shape_, state_)); +} + +HloInstruction* PartitionedHlo::ReplicatePartial(absl::Span dims) { + CHECK(!sharding().IsTileMaximal()); + const Shape& shard_shape = hlo()->shape(); + Shape target_shape = shard_shape; + Shape padded_target_shape = shard_shape; + for (int64 i : dims) { + padded_target_shape.set_dimensions( + i, shard_shape.dimensions(i) * sharding().tile_assignment().dim(i)); + target_shape.set_dimensions(i, base_shape().dimensions(i)); + } + HloInstruction* result = nullptr; if (state_.collective_ops_creator.create_cross_partition_all_gather) { - result = state_.partitioner->AllGatherShards(state_.b, hlo_, sharding, - NewChannel()); - } - Shape padded_base_shape = shape; - for (int64 i = 0; i < padded_base_shape.rank(); ++i) { - padded_base_shape.set_dimensions( - i, shape.dimensions(i) * sharding.tile_assignment().dim(i)); + result = state_.partitioner->AllGatherShards(state_.b, hlo_, sharding(), + NewChannel(), dims, + state_.collective_ops_creator); } if (result == nullptr) { auto zero = state_.b->AddInstruction(HloInstruction::CreateConstant( - LiteralUtil::Zero(shape.element_type()))); + LiteralUtil::Zero(shard_shape.element_type()))); auto zero_bcast = state_.b->AddInstruction( - HloInstruction::CreateBroadcast(padded_base_shape, zero, {})); + HloInstruction::CreateBroadcast(padded_target_shape, zero, {})); + auto offsets = MakePartitionOffsets(padded_target_shape, sharding(), + state_.partition_id, state_.b, dims); auto dus = state_.b->AddInstruction(HloInstruction::CreateDynamicUpdateSlice( - padded_base_shape, zero_bcast, hlo_, - MakePartitionOffsets(padded_base_shape, sharding, - state_.partition_id, state_.b))); + padded_target_shape, zero_bcast, hlo_, offsets)); HloComputation* reduction = - MakeBinaryAdd(shape.element_type(), state_.module); + MakeBinaryAdd(shard_shape.element_type(), state_.module); auto all_reduce = state_.collective_ops_creator.create_cross_partition_all_reduce( - state_.b, dus, reduction, NewChannel()); + state_.b, dus, reduction, + GetPartitionGroupsForReplication(sharding(), dims), NewChannel()); result = all_reduce; } - if (!ShapeUtil::Compatible(base_shape_, padded_base_shape)) { - std::vector start_indices(shape.rank(), 0); - std::vector strides(shape.rank(), 1); - result = state_.b->AddInstruction(HloInstruction::CreateSlice( - base_shape_, result, start_indices, base_shape_.dimensions(), strides)); + if (!ShapeUtil::Compatible(target_shape, padded_target_shape)) { + std::vector start_indices(target_shape.rank(), 0); + std::vector strides(target_shape.rank(), 1); + result = state_.b->AddInstruction( + HloInstruction::CreateSlice(target_shape, result, start_indices, + base_shape_.dimensions(), strides)); } - result->set_sharding(HloSharding::Replicate()); - return update_cache(PartitionedHlo(result, base_shape_, state_)); + return result; } PartitionedHlo PartitionedHlo::Broadcast() const { @@ -740,50 +839,101 @@ PartitionedHlo PartitionedHlo::Broadcast() const { MakeBinaryAdd(shape.element_type(), state_.module); auto result = state_.collective_ops_creator.create_cross_partition_all_reduce( - state_.b, operand, reduction, NewChannel()); + state_.b, operand, reduction, {}, NewChannel()); result->set_sharding(HloSharding::Replicate()); return PartitionedHlo(result, base_shape_, state_); } PartitionedHlo 
PartitionedHlo::ReshardWithAllToAll( - const HloSharding& target) const { - int64 partition_count = sharding().tile_assignment().num_elements(); - absl::optional input_partition_dim = UniqueTiledDim(sharding()); - absl::optional output_partition_dim = UniqueTiledDim(target); - CHECK(input_partition_dim.has_value()); - CHECK(output_partition_dim.has_value()); - - // If the device order is different in the target, fix the order with - // ReshardWithCollectivePermute. - auto input_tile_fixed_device_order = target.tile_assignment(); - input_tile_fixed_device_order.Reshape( - sharding().tile_assignment().dimensions()); - auto input_sharding_fixed_device_order = - HloSharding::Tile(input_tile_fixed_device_order); - if (input_sharding_fixed_device_order != sharding()) { - auto fixed_order = - ReshardWithCollectivePermute(input_sharding_fixed_device_order); - return fixed_order.ReshardWithAllToAll(target); + const HloSharding& target, + absl::Span> source_target_dims) const { + if (source_target_dims.empty()) { + if (target == sharding()) { + return *this; + } + // If the device order is different in the target, fix the order with + // ReshardWithCollectivePermute. + return ReshardWithCollectivePermute(target); } - auto padded_hlo = - PadBaseShapeBeforeUnevenTiledSharding(hlo_, target, state_.b); + // Swap one pair of dimensions. + int64 source_dim = source_target_dims[0].first; + int64 target_dim = source_target_dims[0].second; + const int64 group_size = sharding().tile_assignment().dim(source_dim) / + sharding().tile_assignment().dim(target_dim); - // The order of ids in the group must follow the target sharding. - std::vector groups(1); - for (int64 device : target.tile_assignment()) { - groups[0].add_replica_ids(device); + auto temp_target_tile = sharding().tile_assignment(); + { + std::vector reshape_tile_dims(temp_target_tile.num_dimensions() + 2); + int64 i = 0; + int64 added_source_dim = -1; + int64 added_target_dim = -1; + for (int64 j = 0; j < temp_target_tile.num_dimensions(); ++j) { + if (source_dim == j) { + reshape_tile_dims[i] = temp_target_tile.dim(j) / group_size; + reshape_tile_dims[++i] = group_size; + added_source_dim = i; + } else if (target_dim == j) { + reshape_tile_dims[i] = temp_target_tile.dim(j); + reshape_tile_dims[++i] = 1; + added_target_dim = i; + } else { + reshape_tile_dims[i] = temp_target_tile.dim(j); + } + ++i; + } + temp_target_tile.Reshape(reshape_tile_dims); + std::vector xpose_dims(temp_target_tile.num_dimensions()); + std::iota(xpose_dims.begin(), xpose_dims.end(), 0); + xpose_dims[added_source_dim] = added_target_dim; + xpose_dims[added_target_dim] = added_source_dim; + temp_target_tile = hlo_sharding_util::TransposeSharding( + HloSharding::Tile(temp_target_tile), xpose_dims) + .tile_assignment(); + auto temp_target_tile_dims = sharding().tile_assignment().dimensions(); + temp_target_tile_dims[source_dim] = + sharding().tile_assignment().dim(target_dim); + temp_target_tile_dims[target_dim] = + sharding().tile_assignment().dim(source_dim); + temp_target_tile.Reshape(temp_target_tile_dims); } + auto temp_target = target.ReplicateOnLastTileDim() + ? HloSharding::PartialTile(temp_target_tile) + : HloSharding::Tile(temp_target_tile); + auto padded_shape = hlo_->shape(); + padded_shape.set_dimensions( + target_dim, + RoundUpToNearest(padded_shape.dimensions(target_dim), + temp_target.tile_assignment().dim(target_dim))); + auto padded_hlo = PadToShape(hlo_, padded_shape, state_.b); + + // The order of ids in the group must follow the temp_target sharding. 
+ std::vector> groups( + temp_target.tile_assignment().num_elements() / group_size); + temp_target.tile_assignment().Each( + [&](absl::Span indices, int64 device) { + int64 group_id = 0; + for (int64 dim = 0; dim < indices.size(); ++dim) { + if (dim == target_dim) { + group_id *= temp_target.tile_assignment().dim(dim) / group_size; + group_id += indices[dim] / group_size; + } else { + group_id *= temp_target.tile_assignment().dim(dim); + group_id += indices[dim]; + } + } + groups[group_id].push_back(device); + }); HloInstruction* result = nullptr; - // Split along the split dimension (output_partition_dim) of the all-to-all + // Split along the split dimension (target_dim) of the all-to-all // output. std::vector dimensions; for (int64 i = 0; i < base_shape_.rank(); ++i) { - if (i == *output_partition_dim) { - dimensions.push_back(partition_count); - dimensions.push_back(padded_hlo->shape().dimensions(i) / partition_count); + if (i == target_dim) { + dimensions.push_back(group_size); + dimensions.push_back(padded_hlo->shape().dimensions(i) / group_size); } else { dimensions.push_back(padded_hlo->shape().dimensions(i)); } @@ -794,21 +944,19 @@ PartitionedHlo PartitionedHlo::ReshardWithAllToAll( // After the reshape, it is guaranteed to have at least 3 dimensions. auto all_to_all = state_.collective_ops_creator.create_cross_partition_all_to_all( - state_.b, {reshape}, groups, (*state_.next_channel_id)++, - output_partition_dim); + state_.b, {reshape}, groups, (*state_.next_channel_id)++, target_dim); // Reorder the split dimension of the reshape to be located in front of the // input partition dimension, so the two dimensions can be combined. - int64 new_input_partition_dim = (*output_partition_dim < *input_partition_dim) - ? *input_partition_dim + 1 - : *input_partition_dim; + int64 new_source_dim = + (target_dim < source_dim) ? source_dim + 1 : source_dim; std::vector permutation; for (int64 i = 0; i < all_to_all->shape().rank(); ++i) { - if (i == *output_partition_dim) { + if (i == target_dim) { continue; } - if (i == new_input_partition_dim) { - permutation.push_back(*output_partition_dim); + if (i == new_source_dim) { + permutation.push_back(target_dim); } permutation.push_back(i); } @@ -819,32 +967,33 @@ PartitionedHlo PartitionedHlo::ReshardWithAllToAll( // Combine the split dimension and the input partition dimension. 
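The combine step referenced just above is the final reshape in the next hunk; together with the earlier split and transpose it moves sharding from the source dimension to the target dimension. The dimension bookkeeping restated on plain shape dimensions (AllToAllDimPlan and PlanAllToAllDims are illustrative names, not the patch's API):

#include <cstdint>
#include <vector>

struct AllToAllDimPlan {
  std::vector<int64_t> split_dims;   // shape fed into the all-to-all
  std::vector<int64_t> permutation;  // moves the group dim next to source_dim
  std::vector<int64_t> merged_dims;  // shape after the final reshape
};

AllToAllDimPlan PlanAllToAllDims(const std::vector<int64_t>& padded_dims,
                                 int64_t source_dim, int64_t target_dim,
                                 int64_t group_size) {
  AllToAllDimPlan plan;
  const int64_t rank = static_cast<int64_t>(padded_dims.size());
  // Split target_dim into [group_size, size / group_size].
  for (int64_t i = 0; i < rank; ++i) {
    if (i == target_dim) {
      plan.split_dims.push_back(group_size);
      plan.split_dims.push_back(padded_dims[i] / group_size);
    } else {
      plan.split_dims.push_back(padded_dims[i]);
    }
  }
  // Transpose the split (group) dim to sit right before the source dim.
  const int64_t new_source_dim =
      (target_dim < source_dim) ? source_dim + 1 : source_dim;
  for (int64_t i = 0; i < rank + 1; ++i) {
    if (i == target_dim) continue;  // the group dim itself
    if (i == new_source_dim) plan.permutation.push_back(target_dim);
    plan.permutation.push_back(i);
  }
  // Merge the group dim into the source dim; target_dim shrinks accordingly.
  for (int64_t i = 0; i < rank; ++i) {
    if (i == source_dim) {
      plan.merged_dims.push_back(padded_dims[i] * group_size);
    } else if (i == target_dim) {
      plan.merged_dims.push_back(padded_dims[i] / group_size);
    } else {
      plan.merged_dims.push_back(padded_dims[i]);
    }
  }
  return plan;
}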
auto new_shape = ShapeInference::InferAllToAllShape( - padded_hlo->shape(), *output_partition_dim, - *input_partition_dim, partition_count) + padded_hlo->shape(), target_dim, source_dim, group_size) .ValueOrDie(); result = state_.b->AddInstruction( HloInstruction::CreateReshape(new_shape, transpose)); - const Shape result_shape = MakePartitionedShape(base_shape_, target); + const Shape result_shape = MakePartitionedShape(base_shape_, temp_target); if (result_shape != result->shape()) { result = state_.b->AddInstruction(HloInstruction::CreateSlice( result_shape, result, std::vector(result_shape.rank(), 0), result_shape.dimensions(), std::vector(result_shape.rank(), 1))); } - result->set_sharding(target); - return PartitionedHlo(result, base_shape_, state_); + result->set_sharding(temp_target); + auto remaining_source_target_dims = source_target_dims; + remaining_source_target_dims.remove_prefix(1); + return PartitionedHlo(result, base_shape_, state_) + .ReshardWithAllToAll(target, remaining_source_target_dims); } PartitionedHlo PartitionedHlo::ReshardWithCollectivePermute( const HloSharding& target) const { - CHECK(CanReshardWithCollectivePermute(sharding(), target)); + CHECK(CanReshardWithCollectivePermute(sharding(), target)) + << sharding().ToString() << " to " << target.ToString(); std::vector> src_dst_pairs; sharding().tile_assignment().Each( [&](absl::Span indices, int64 src_device) { int64 dst_device = target.tile_assignment()(indices); - if (dst_device != src_device) { - src_dst_pairs.emplace_back(src_device, dst_device); - } + src_dst_pairs.emplace_back(src_device, dst_device); }); auto cp = state_.collective_ops_creator.create_cross_partition_collective_permute( @@ -990,7 +1139,7 @@ Status SpmdPartitioningVisitor::HandleConcatenate(HloInstruction* hlo) { offset += operand->shape().dimensions(dimension); } auto all_reduce = collective_ops_creator_.create_cross_partition_all_reduce( - &b_, temp_output, MakeBinaryAdd(hlo->shape().element_type(), module_), + &b_, temp_output, MakeBinaryAdd(hlo->shape().element_type(), module_), {}, NewChannel()); SetPartitionedHlo(hlo, [&] { auto start_indices = @@ -1005,47 +1154,7 @@ Status SpmdPartitioningVisitor::HandleConcatenate(HloInstruction* hlo) { return Status::OK(); } -// If partitioning in the operand only happens in dimensions in passthrough -// dimensions (offset dimensions in the gather output (or scatter update) that -// have the same size as the operand), returns the corresponding output (or -// update) sharding by passing through the input sharding. 
-absl::optional PassthroughOperandToGatherOutputOrScatterUpdate( - const PartitionedHlo& operand, const Shape& update_or_gather_shape, - absl::Span collapsed_or_inserted_dims, - absl::Span index_map, - absl::Span offset_or_window_dims, - absl::Span slice_size) { - if (operand.sharding().IsTileMaximal()) { - return operand.sharding(); - } - std::vector passthrough_tile(update_or_gather_shape.rank(), 1); - int64 collapsed = 0; - for (int64 i = 0; i < operand.base_shape().rank(); ++i) { - int64 dim_partitions = operand.sharding().tile_assignment().dim(i); - if (absl::c_linear_search(collapsed_or_inserted_dims, i) || - absl::c_linear_search(index_map, i)) { - if (dim_partitions > 1) { - return absl::nullopt; - } - collapsed++; - continue; - } - if (slice_size[i] != operand.base_shape().dimensions(i) && - dim_partitions > 1) { - return absl::nullopt; - } - int64 offset_dim = offset_or_window_dims[i - collapsed]; - if (i - collapsed > 0 && - offset_dim < offset_or_window_dims[i - collapsed - 1]) { - // Output offsets are transposed, we do not support this case. - return absl::nullopt; - } - passthrough_tile[offset_dim] = dim_partitions; - } - Array tile_assignment = operand.sharding().tile_assignment(); - tile_assignment.Reshape(passthrough_tile); - return HloSharding::Tile(tile_assignment); -} +namespace { // Returns whether partitioning in the operand only happens in dimensions with // gather/scatter slice size 1. @@ -1140,6 +1249,8 @@ IndexBoundsForGatherScatterOperandPartitionedOnTrivialSliceDims( return {broadcast_min, broadcast_max}; } +} // namespace + Status SpmdPartitioningVisitor::HandleScatter(HloInstruction* hlo) { auto scatter = Cast(hlo); auto dnums = scatter->scatter_dimension_numbers(); @@ -1155,17 +1266,87 @@ Status SpmdPartitioningVisitor::HandleScatter(HloInstruction* hlo) { slice_size[i] = updates.base_shape().dimensions( dnums.update_window_dims(num_update_window_dims++)); } - std::vector inserted_window_dims(dnums.inserted_window_dims().begin(), - dnums.inserted_window_dims().end()); std::vector scatter_dims_to_operand_dims( dnums.scatter_dims_to_operand_dims().begin(), dnums.scatter_dims_to_operand_dims().end()); - std::vector update_window_dims(dnums.update_window_dims().begin(), - dnums.update_window_dims().end()); - if (!operand.sharding().IsTileMaximal()) { - auto maybe_passthrough = PassthroughOperandToGatherOutputOrScatterUpdate( - operand, updates.base_shape(), inserted_window_dims, - scatter_dims_to_operand_dims, update_window_dims, slice_size); + std::vector update_scatter_dims; + for (int64 i = 0; i < updates.base_shape().rank(); ++i) { + if (!absl::c_linear_search(dnums.update_window_dims(), i)) { + update_scatter_dims.push_back(i); + } + } + if (operand.sharding().IsTileMaximal()) { + if (!indices.sharding().IsTileMaximal() && + (dnums.index_vector_dim() == indices.base_shape().rank() || + indices.sharding().tile_assignment().dim(dnums.index_vector_dim()) == + 1)) { + auto reduction_opcode = ParseReductionComputation(scatter->to_apply()); + if (!reduction_opcode.has_value()) { + return DefaultAction(hlo); + } + HloInstruction* identity; + switch (*reduction_opcode) { + case HloOpcode::kAdd: + case HloOpcode::kOr: + identity = CreateZero(operand.hlo()->shape(), &b_); + break; + case HloOpcode::kMultiply: + case HloOpcode::kAnd: + identity = CreateOne(operand.hlo()->shape(), &b_); + break; + case HloOpcode::kMinimum: + identity = CreateConstant( + operand.hlo()->shape(), + LiteralUtil::MaxValue(hlo->shape().element_type()), &b_); + break; + case 
HloOpcode::kMaximum: + identity = CreateConstant( + operand.hlo()->shape(), + LiteralUtil::MinValue(hlo->shape().element_type()), &b_); + break; + default: + return DefaultAction(hlo); + } + std::vector update_dim_to_index_dim(updates.base_shape().rank(), + -1); + std::vector index_dim_to_update_dim(indices.base_shape().rank(), + -1); + for (int64 i = 0; i < update_scatter_dims.size(); ++i) { + int64 indices_scatter_dim = i < dnums.index_vector_dim() ? i : i + 1; + update_dim_to_index_dim[update_scatter_dims[i]] = indices_scatter_dim; + index_dim_to_update_dim[indices_scatter_dim] = update_scatter_dims[i]; + } + auto new_updates_sharding = TransposeShardingWithCollapsedDims( + indices.sharding(), index_dim_to_update_dim, update_dim_to_index_dim); + CHECK(new_updates_sharding.has_value()); + updates = updates.Reshard(*new_updates_sharding); + // To avoid accumulating the initial operand multiple times during + // all-reduce, we use identity operands for all non-zero partitions. + auto not_partition_zero = b_.AddInstruction(HloInstruction::CreateConvert( + ShapeUtil::MakeScalarShape(PRED), partition_id_)); + not_partition_zero = b_.AddInstruction(HloInstruction::CreateBroadcast( + ShapeUtil::ChangeElementType(identity->shape(), PRED), + not_partition_zero, {})); + auto select_operand = + b_.AddInstruction(HloInstruction::HloInstruction::CreateTernary( + identity->shape(), HloOpcode::kSelect, not_partition_zero, + identity, operand.Replicate().hlo())); + auto pscatter = b_.AddInstruction(scatter->CloneWithNewOperands( + scatter->shape(), {select_operand, indices.hlo(), updates.hlo()})); + auto all_reduce = + collective_ops_creator_.create_cross_partition_all_reduce( + &b_, pscatter, scatter->to_apply(), {}, NewChannel()); + all_reduce->set_sharding(HloSharding::Replicate()); + SetPartitionedHlo(hlo, [&]() { + return PartitionedHlo(all_reduce, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); + } + } else { + auto maybe_passthrough = hlo_sharding_util::ScatterUpdateShardingFromOutput( + operand.sharding(), *hlo); // Handle pass through cases if we can use compatible sharding for update. if (maybe_passthrough.has_value()) { indices = indices.Reshard(HloSharding::Replicate()); @@ -1865,67 +2046,22 @@ Status SpmdPartitioningVisitor::HandleBroadcast(HloInstruction* hlo) { auto& operand = GetPartitionedHlo(hlo->operand(0)); // Tiled output. - std::vector wanted_input_tile_size(operand.base_shape().rank()); - std::vector sharded_new_dims; - for (int64 i = 0; i < operand.base_shape().rank(); ++i) { - wanted_input_tile_size[i] = - hlo->sharding().tile_assignment().dim(hlo->dimensions(i)); - } + std::vector new_dims; for (int64 i = 0; i < hlo->shape().rank(); ++i) { - if (!absl::c_linear_search(hlo->dimensions(), i) && - hlo->sharding().tile_assignment().dim(i) > 1) { - sharded_new_dims.push_back(i); + if (!absl::c_linear_search(hlo->dimensions(), i)) { + new_dims.push_back(i); } } - if (sharded_new_dims.empty()) { - // The new dimensions are replicated, so that we can do the adjustment on - // the input. 
- Array wanted_input_tile_assignment(wanted_input_tile_size); - wanted_input_tile_assignment.Each( - [&](absl::Span indices, int64* val) { - std::vector indices_in_broadcast(hlo->shape().rank(), 0); - for (int64 i = 0; i < operand.base_shape().rank(); ++i) { - indices_in_broadcast[hlo->dimensions(i)] = indices[i]; - } - *val = hlo->sharding().tile_assignment()(indices_in_broadcast); - }); - SetPartitionedHlo(hlo, [&] { - return b_.AddInstruction(hlo->CloneWithNewOperands( - MakePartitionedShape(hlo->shape(), hlo->sharding()), - {operand.Reshard(HloSharding::Tile(wanted_input_tile_assignment)) - .hlo()})); - }); - } else { - auto input = operand.Reshard(HloSharding::Replicate()).hlo(); - // We pad and shard the input first, then broadcast to the final shard - // shape. - auto output_offsets = - MakePartitionOffsets(hlo->shape(), hlo->sharding(), partition_id_, &b_); - std::vector input_offsets(operand.base_shape().rank()); - auto output_shard_shape = - MakePartitionedShape(hlo->shape(), hlo->sharding()); - auto input_shard_shape = input->shape(); - auto padded_input_shape = input->shape(); - for (int64 i = 0; i < input_offsets.size(); ++i) { - input_offsets[i] = output_offsets[hlo->dimensions(i)]; - input_shard_shape.set_dimensions( - i, output_shard_shape.dimensions(hlo->dimensions(i))); - padded_input_shape.set_dimensions( - i, hlo->sharding().tile_assignment().dim(hlo->dimensions(i)) * - input_shard_shape.dimensions(i)); - } - auto padded_input = PadToShape(input, padded_input_shape, &b_); - auto input_shard = - ShapeUtil::Compatible(input_shard_shape, padded_input->shape()) - ? padded_input - : b_.AddInstruction(HloInstruction::CreateDynamicSlice( - input_shard_shape, padded_input, input_offsets, - input_shard_shape.dimensions())); - SetPartitionedHlo(hlo, [&] { - return b_.AddInstruction( - hlo->CloneWithNewOperands(output_shard_shape, {input_shard})); - }); - } + auto desired_input_sharding = hlo_sharding_util::RemoveShapeDimensions( + hlo_sharding_util::PartiallyReplicateTiledShardingOnDims(hlo->sharding(), + new_dims), + new_dims); + auto input = operand.Reshard(desired_input_sharding).hlo(); + auto output_shard_shape = MakePartitionedShape(hlo->shape(), hlo->sharding()); + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction( + hlo->CloneWithNewOperands(output_shard_shape, {input})); + }); return Status::OK(); } @@ -2019,16 +2155,50 @@ Status SpmdPartitioningVisitor::HandleGather(HloInstruction* hlo) { const auto& dnums = gather->gather_dimension_numbers(); auto operand = GetPartitionedHlo(gather->operand(0)); auto indices = GetPartitionedHlo(gather->operand(1)); - std::vector collapsed_slice_dims(dnums.collapsed_slice_dims().begin(), - dnums.collapsed_slice_dims().end()); std::vector start_index_map(dnums.start_index_map().begin(), dnums.start_index_map().end()); - std::vector offset_dims(dnums.offset_dims().begin(), - dnums.offset_dims().end()); - if (!operand.sharding().IsTileMaximal()) { - auto maybe_passthrough = PassthroughOperandToGatherOutputOrScatterUpdate( - operand, gather->shape(), collapsed_slice_dims, start_index_map, - offset_dims, gather->gather_slice_sizes()); + std::vector batch_dims; + for (int64 i = 0; i < gather->shape().rank(); ++i) { + if (!absl::c_linear_search(dnums.offset_dims(), i)) { + batch_dims.push_back(i); + } + } + if (operand.sharding().IsTileMaximal()) { + if (!indices.sharding().IsTileMaximal() && + (dnums.index_vector_dim() == indices.base_shape().rank() || + indices.sharding().tile_assignment().dim(dnums.index_vector_dim()) == + 1)) { + auto 
replicated_operand = operand.Replicate(); + TF_ASSIGN_OR_RETURN( + Shape partitioned_output_shape, + ShapeInference::InferGatherShape(replicated_operand.hlo()->shape(), + indices.hlo()->shape(), dnums, + gather->gather_slice_sizes())); + auto pgather = b_.AddInstruction(gather->CloneWithNewOperands( + partitioned_output_shape, {replicated_operand.hlo(), indices.hlo()})); + std::vector output_dim_to_index_dim(pgather->shape().rank(), -1); + std::vector index_dim_to_output_dim(indices.base_shape().rank(), + -1); + for (int64 i = 0; i < batch_dims.size(); ++i) { + int64 indices_batch_dim = i < dnums.index_vector_dim() ? i : i + 1; + output_dim_to_index_dim[batch_dims[i]] = indices_batch_dim; + index_dim_to_output_dim[indices_batch_dim] = batch_dims[i]; + } + auto pgather_sharding = TransposeShardingWithCollapsedDims( + indices.sharding(), index_dim_to_output_dim, output_dim_to_index_dim); + CHECK(pgather_sharding.has_value()); + pgather->set_sharding(*pgather_sharding); + SetPartitionedHlo(hlo, [&]() { + return PartitionedHlo(pgather, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); + } + } else { + auto maybe_passthrough = + hlo_sharding_util::GatherOutputShardingFromDataOperand( + operand.sharding(), *hlo); if (maybe_passthrough.has_value()) { indices = indices.Reshard(HloSharding::Replicate()); auto pshape = MakePartitionedShape(gather->shape(), *maybe_passthrough); @@ -2116,7 +2286,7 @@ Status SpmdPartitioningVisitor::HandleGather(HloInstruction* hlo) { // Combine from different partitions. auto ar = collective_ops_creator_.create_cross_partition_all_reduce( &b_, filtered, - MakeBinaryAdd(filtered->shape().element_type(), module_), + MakeBinaryAdd(filtered->shape().element_type(), module_), {}, NewChannel()); ar->set_sharding(HloSharding::Replicate()); SetPartitionedHlo(hlo, [&]() { @@ -2227,31 +2397,47 @@ Status SpmdPartitioningVisitor::HandleInfeed(HloInstruction* hlo) { /*parameter_number=*/0, token->shape(), "infeed_token_param")); auto infeed = branch_b.AddInstruction(HloInstruction::CreateInfeed( per_branch_partitioned_shapes[i], param, hlo->infeed_config())); - branches[i] = module_->AddEmbeddedComputation(branch_b.Build(infeed)); if (!ShapeUtil::Compatible(per_branch_partitioned_shapes[i], shard_shape)) { - TF_ASSIGN_OR_RETURN( - auto padded, - branches[i]->DeepCopyInstructionWithCustomCopier( - infeed, [&](HloInstruction* leaf, const ShapeIndex& leaf_index, - HloComputation* comp) { - // Index {1} corresponds to the token. - if (leaf_index.empty() || leaf_index[0] != 0) { - return leaf; - } - ShapeIndexView subindex(leaf_index, 1); - if (ShapeUtil::Compatible( - ShapeUtil::GetSubshape(per_branch_partitioned_shapes[i], - subindex), - ShapeUtil::GetSubshape(shard_shape, subindex))) { - return leaf; - } - return PadToShape(leaf, - ShapeUtil::GetSubshape(shard_shape, subindex), - nullptr, comp); - })); - branches[i]->set_root_instruction(padded, - /*accept_different_shape=*/true); + std::function + pad_infeed = [&](const ShapeIndex& index, + HloInstruction* infeed_element) -> HloInstruction* { + if (index == ShapeIndex({1})) { + // Token. 
+ return infeed_element; + } + const Shape& element_shape = + ShapeUtil::GetSubshape(infeed->shape(), index); + if (element_shape.IsTuple() && element_shape.tuple_shapes_size() > 0) { + std::vector padded_elements( + element_shape.tuple_shapes_size()); + for (int64 i = 0; i < padded_elements.size(); ++i) { + auto sub_index = index; + sub_index.push_back(i); + padded_elements[i] = pad_infeed( + sub_index, + branch_b.AddInstruction(HloInstruction::CreateGetTupleElement( + ShapeUtil::GetSubshape(element_shape, {i}), infeed_element, + i))); + } + return branch_b.AddInstruction( + HloInstruction::CreateTuple(padded_elements)); + } + const Shape& pad_shape = + ShapeUtil::GetSubshape(shard_shape, ShapeIndexView(index, 1)); + if (ShapeUtil::Compatible(element_shape, pad_shape)) { + return infeed_element; + } + if (element_shape.IsArray()) { + CHECK(pad_shape.IsArray()); + return PadToShape(infeed_element, pad_shape, &branch_b); + } + CHECK(element_shape.IsTuple()); + CHECK(element_shape.tuple_shapes().empty()); + return CreateZero(pad_shape, &branch_b); + }; + pad_infeed({}, infeed); } + branches[i] = module_->AddEmbeddedComputation(branch_b.Build()); } SetPartitionedHlo(hlo, [&]() { return b_.AddInstruction(HloInstruction::CreateConditional( @@ -2374,17 +2560,6 @@ Status SpmdPartitioningVisitor::HandleReduce(HloInstruction* hlo) { if (reduce_sharded_dimension && input_count > 1) { return DefaultAction(hlo); } - - // Currently we only support reducing all or none of the sharded - // dimensions. - if (reduce_sharded_dimension) { - for (int64 i = 0; i < inputs[0].base_shape().rank(); ++i) { - if (inputs[0].sharding().tile_assignment().dim(i) > 1 && - absl::c_count(hlo->dimensions(), i) == 0) { - return DefaultAction(hlo); - } - } - } } std::vector new_operand_shapes(input_count * 2); @@ -2397,7 +2572,6 @@ Status SpmdPartitioningVisitor::HandleReduce(HloInstruction* hlo) { auto reduce_shape, ShapeInference::InferReduceShape(new_operand_shapes, hlo->dimensions(), hlo->to_apply()->ComputeProgramShape())); - *reduce_shape.mutable_layout() = hlo->shape().layout(); std::vector input_hlos(input_count); for (int64 i = 0; i < input_count; ++i) { @@ -2408,36 +2582,30 @@ Status SpmdPartitioningVisitor::HandleReduce(HloInstruction* hlo) { local_reduce->set_metadata(hlo->metadata()); SetPartitionedHlo(hlo, [&]() { - HloInstruction* reduce; + HloInstruction* reduce = local_reduce; if (reduce_sharded_dimension) { CHECK(local_reduce->shape().IsArray()); - reduce = collective_ops_creator_.create_cross_partition_all_reduce( - &b_, local_reduce, hlo->to_apply(), NewChannel()); - reduce->set_sharding(HloSharding::Replicate()); - } else { - reduce = local_reduce; - if (inputs[0].sharding().IsTileMaximal()) { - reduce->set_sharding(inputs[0].sharding()); - } else { - // Remove tile assignment dimensions that are reduced. 
- std::vector tile_dimensions; - for (int64 i = 0; i < input_hlos[0]->shape().rank(); ++i) { - if (absl::c_count(hlo->dimensions(), i) == 0) { - tile_dimensions.push_back( - inputs[0].sharding().tile_assignment().dim(i)); - } + std::vector preserved_dims; + for (int64 i = 0; i < inputs[0].base_shape().rank(); ++i) { + if (!absl::c_linear_search(hlo->dimensions(), i)) { + preserved_dims.push_back(i); } - Array new_tile = inputs[0].sharding().tile_assignment(); - new_tile.Reshape(tile_dimensions); - auto sharding = HloSharding::Tile(new_tile); - if (input_count > 1) { - std::vector tuple(input_count, sharding); - sharding = HloSharding::Tuple(hlo->shape(), tuple); - } - reduce->set_sharding(sharding); } + if (inputs[0].sharding().ReplicateOnLastTileDim()) { + preserved_dims.push_back(inputs[0].base_shape().rank()); + } + auto grouped = GroupShardingOnDims(inputs[0].sharding(), preserved_dims); + auto grouped_state = CreatePerGroupPartitioningState( + inputs[0].state(), grouped.device_groups, &b_); + reduce = grouped_state.collective_ops_creator + .create_cross_partition_all_reduce( + &b_, local_reduce, hlo->to_apply(), {}, NewChannel()); } - + auto sharding = hlo_sharding_util::RemoveShapeDimensions( + hlo_sharding_util::PartiallyReplicateTiledShardingOnDims( + inputs[0].sharding(), hlo->dimensions()), + hlo->dimensions()); + reduce->set_sharding(sharding); return PartitionedHlo(reduce, hlo->shape(), MakePartitioningState()) .Reshard(hlo->sharding()) .hlo(); @@ -2846,1774 +3014,6 @@ Status SpmdPartitioningVisitor::HandleTuple(HloInstruction* hlo) { return Status::OK(); } -Status SpmdPartitioningVisitor::HandleConvolutionTiledLhsAndRhs( - HloInstruction* hlo) { - TF_RET_CHECK(hlo->opcode() == HloOpcode::kConvolution); - - auto lhs = GetPartitionedHlo(hlo->operand(0)); - auto rhs = GetPartitionedHlo(hlo->operand(1)); - TF_RET_CHECK(!lhs.sharding().IsTileMaximal() && - !rhs.sharding().IsTileMaximal()); - - const auto& dnums = hlo->convolution_dimension_numbers(); - - // Check if the operand shardings are aligned. Also we currently don't - // support partitioning non-spatial dimensions. - std::vector rhs_to_lhs_indices(hlo->shape().rank()); - rhs_to_lhs_indices[dnums.kernel_output_feature_dimension()] = - dnums.input_batch_dimension(); - rhs_to_lhs_indices[dnums.kernel_input_feature_dimension()] = - dnums.input_feature_dimension(); - for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { - rhs_to_lhs_indices[dnums.kernel_spatial_dimensions(i)] = - dnums.input_spatial_dimensions(i); - } - std::vector lhs_to_rhs_indices(hlo->shape().rank()); - for (int64 i = 0; i < rhs_to_lhs_indices.size(); ++i) { - lhs_to_rhs_indices[rhs_to_lhs_indices[i]] = i; - } - - Window window = hlo->window(); - std::vector reversed_rhs_dims; - for (int64 i = 0; i < window.dimensions_size(); ++i) { - if (window.dimensions(i).window_reversal()) { - reversed_rhs_dims.push_back(dnums.kernel_spatial_dimensions(i)); - } - } - if (!reversed_rhs_dims.empty()) { - // Make the reversed dims left-padded to prepare for window reversal. - auto left_padded_rhs = HaloExchangeToPadOnLeft(rhs, reversed_rhs_dims); - if (left_padded_rhs == nullptr) { - return DefaultAction(hlo); - } - left_padded_rhs->set_sharding(rhs.sharding()); - rhs = PartitionedHlo(left_padded_rhs, rhs.base_shape(), rhs.state()); - } - // Consider window reversal when resharding RHS or LHS. Note: this will not - // reverse the data in the shard. We use window reversal to do that. 
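The removed hunk that follows derives per-shard halo sizes from the projection of each RHS shard onto the LHS. For readability, the expressions from that comment, where LHS and RHS denote per-shard sizes, D the window dilation, WC the window count, stride the window stride, and i the shard index:

\[
\begin{aligned}
\mathrm{offset}(i)      &= \mathrm{RHS}\cdot D\cdot i - \mathrm{low\_padding}\\
\mathrm{limit}(i)       &= \bigl((\mathrm{RHS}-1)\cdot D + 1\bigr)\cdot(i+1) + (\mathrm{WC}-1)\cdot \mathrm{stride} - \mathrm{low\_padding}\\
\mathrm{left\_halo}(i)  &= i\cdot \mathrm{LHS} - \mathrm{offset}(i)\\
\mathrm{right\_halo}(i) &= \mathrm{limit}(i) - (i+1)\cdot \mathrm{LHS}
\end{aligned}
\]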
- auto aligned_rhs_sharding = hlo_sharding_util::ReverseSharding( - hlo_sharding_util::TransposeSharding(lhs.sharding(), rhs_to_lhs_indices), - reversed_rhs_dims); - auto aligned_lhs_sharding = hlo_sharding_util::TransposeSharding( - hlo_sharding_util::ReverseSharding(rhs.sharding(), reversed_rhs_dims), - lhs_to_rhs_indices); - - auto unsupported_sharding = [&](const HloSharding& lhs_sharding, - const HloSharding& rhs_sharding) { - return lhs_sharding.tile_assignment().dim(dnums.input_batch_dimension()) != - 1 || - rhs_sharding.tile_assignment().dim( - dnums.kernel_output_feature_dimension()) != 1; - }; - - auto zero = b_.AddInstruction(HloInstruction::CreateConstant( - LiteralUtil::Zero(hlo->shape().element_type()))); - if (ShapeSizeInBytes(lhs.base_shape()) < ShapeSizeInBytes(rhs.base_shape())) { - if (unsupported_sharding(aligned_lhs_sharding, rhs.sharding())) { - return DefaultAction(hlo); - } - lhs = lhs.Reshard(aligned_lhs_sharding).PadWithValue(zero); - rhs = rhs.PadWithValue(zero, reversed_rhs_dims); - } else { - if (unsupported_sharding(lhs.sharding(), aligned_rhs_sharding)) { - return DefaultAction(hlo); - } - lhs = lhs.PadWithValue(zero); - rhs = - rhs.Reshard(aligned_rhs_sharding).PadWithValue(zero, reversed_rhs_dims); - } - - // Reshard LHS by exchanging halo such that each shard computes the partial - // sum of the full shape result, and add AllReduce. - // - // The size of halo on each dimension can be calculated from the projection - // onto the LHS that each RHS shard i needs to read. RHS and LHS below refers - // to the shard size of RHS and LHS, WC is the number of windows, and D is the - // window dilation. - // - // * offset(i): RHS * D * i - low_padding - // * limit(i): {(RHS - 1) * D + 1} * (i + 1) + (WC - 1) * stride - low_padding - // - // Since shard i has LHS of range [i * LHS, (i + 1) * LHS) - // * left-halo: i * LHS - offset(i) - // = (LHS - RHS) * i + low_padding - // * right-halo: limit(i) - (i + 1) * LHS - // = [{(RHS - 1) * D + 1} - LHS] * (i + 1) + (WC - 1) * stride - low_padding - std::vector shard_counts(dnums.input_spatial_dimensions_size()); - std::vector lhs_shard_sizes(dnums.input_spatial_dimensions_size()); - std::vector rhs_shard_sizes(dnums.input_spatial_dimensions_size()); - for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { - int64 lhs_dimension = dnums.input_spatial_dimensions(i); - int64 rhs_dimension = dnums.kernel_spatial_dimensions(i); - int64 shard_count = lhs.sharding().tile_assignment().dim(lhs_dimension); - auto wd = window.dimensions(i); - if (wd.base_dilation() != 1) { - return DefaultAction(hlo); - } - - int64 lhs_shard_size = - CeilOfRatio(lhs.base_shape().dimensions(lhs_dimension), shard_count); - int64 rhs_shard_size = - CeilOfRatio(rhs.base_shape().dimensions(rhs_dimension), shard_count); - shard_counts[i] = shard_count; - lhs_shard_sizes[i] = lhs_shard_size; - rhs_shard_sizes[i] = rhs_shard_size; - } - - std::vector left_halo_size_functions(hlo->shape().rank()); - std::vector right_halo_size_functions(hlo->shape().rank()); - Window new_window = window; - - auto partition_ordinals = - MakeTiledPartitionOrdinals(lhs.sharding(), partition_id_, &b_); - HloInstruction* lhs_with_halo = lhs.hlo(); - for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { - int64 lhs_dimension = dnums.input_spatial_dimensions(i); - int64 lhs_shard_size = lhs_shard_sizes[i]; - int64 rhs_shard_size = rhs_shard_sizes[i]; - - if (shard_counts[i] == 1) { - continue; - } - - // Calculate the left and right halo sizes as described 
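The left and right halo sizes in the comment above are affine in the shard index i. A small standalone evaluation of those formulas under hypothetical sizes (base dilation 1, as this path requires; window dilation also set to 1 here), which can help sanity-check a concrete configuration:

#include <cstdint>
#include <iostream>

// Evaluates the two halo-size functions described in the comment above for
// each shard index, using made-up sizes.
int main() {
  const int64_t shards = 2, stride = 1, dilation = 1;
  const int64_t base = 16, kernel = 3;  // full LHS / RHS extents on this dim
  const int64_t padding_low = 1, padding_high = 1;
  const int64_t lhs_shard = (base + shards - 1) / shards;    // CeilOfRatio
  const int64_t rhs_shard = (kernel + shards - 1) / shards;  // CeilOfRatio
  const int64_t rhs_dilated = (rhs_shard - 1) * dilation + 1;
  const int64_t window_count =
      1 + (padding_low + padding_high + base - (1 + (kernel - 1) * dilation)) /
              stride;

  for (int64_t i = 0; i < shards; ++i) {
    // left-halo(i)  = (LHS - RHS * D) * i + low_padding
    const int64_t left = (lhs_shard - rhs_shard * dilation) * i + padding_low;
    // right-halo(i) = (dilated RHS - LHS) * (i + 1) + (WC - 1) * stride - low_padding
    const int64_t right = (rhs_dilated - lhs_shard) * (i + 1) +
                          (window_count - 1) * stride - padding_low;
    std::cout << "shard " << i << ": left halo " << left << ", right halo "
              << right << "\n";
  }
  return 0;
}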
in the comments - // above. - auto wd = window.dimensions(i); - int64 padding_low = wd.padding_low(); - int64 padding_high = wd.padding_high(); - int64 base = lhs.base_shape().dimensions(lhs_dimension); - int64 window_count = 1 + (padding_low + padding_high + base - - (1 + (wd.size() - 1) * wd.window_dilation())) / - wd.stride(); - int64 rhs_shard_size_dilated = - (rhs_shard_size - 1) * wd.window_dilation() + 1; - - left_halo_size_functions[lhs_dimension] = - OffsetCalculation(MultiplyAddDivideOffsetCalculation( - lhs_shard_size - rhs_shard_size * wd.window_dilation(), padding_low, - 1)); - right_halo_size_functions[lhs_dimension] = - OffsetCalculation(MultiplyAddDivideOffsetCalculation( - rhs_shard_size_dilated - lhs_shard_size, - rhs_shard_size_dilated - lhs_shard_size + - wd.stride() * (window_count - 1) - padding_low, - 1)); - - // Exchange halo and concatenate. - int64 dim = dnums.input_spatial_dimensions(i); - int64 explicit_left_padding_on_full_shape = padding_low; - int64 shard_size_with_halo = - wd.stride() * (window_count - 1) + rhs_shard_size_dilated; - - new_window.mutable_dimensions(i)->set_padding_low(0); - new_window.mutable_dimensions(i)->set_padding_high(0); - new_window.mutable_dimensions(i)->set_size(rhs_shard_size); - - // offset_on_padded_shape and padded_full_shape_size are needed only if - // we want to mask out-of-range values in ExchangeHaloAndGetValidData(). - // Since the default value for both the collective-permute is zero and - // also we call PadWithValue() on both operands at the beginning, we - // don't need to mask here. - // - // TODO(hyoulkee): Consider removing one of the two PadWithValue() calls - // if it's always safe. - auto offset_on_padded_shape = - OffsetCalculation(MultiplyAddDivideOffsetCalculation()); - int64 padded_full_shape_size = 0; - auto concat = ExchangeHaloAndGetValidData( - lhs_with_halo, lhs.base_shape(), left_halo_size_functions[dim], - right_halo_size_functions[dim], explicit_left_padding_on_full_shape, - padded_full_shape_size, shard_size_with_halo, dim, lhs.sharding(), - offset_on_padded_shape.Calculate(partition_ordinals[dim], &b_), zero, - partition_ordinals[dim], collective_ops_creator_, next_channel_id_, &b_, - /*mask_invalid_region=*/false); - if (!concat) { - return DefaultAction(hlo); - } - lhs_with_halo = *concat; - } - - SetPartitionedHlo(hlo, [&]() { - auto conv = b_.AddInstruction(HloInstruction::CreateConvolve( - hlo->shape(), lhs_with_halo, rhs.hlo(), hlo->feature_group_count(), - hlo->batch_group_count(), new_window, - hlo->convolution_dimension_numbers(), hlo->precision_config())); - auto ar = collective_ops_creator_.create_cross_partition_all_reduce( - &b_, conv, MakeBinaryAdd(hlo->shape().element_type(), module_), - NewChannel()); - ar->set_sharding(HloSharding::Replicate()); - return PartitionedHlo(ar, hlo->shape(), MakePartitioningState()) - .Reshard(hlo->sharding()) - .hlo(); - }); - return Status::OK(); -} - -Status SpmdPartitioningVisitor::HandleConvolution(HloInstruction* hlo) { - auto dot_dnums = dot_as_convolution_util::ParseDotGeneralFromConvolution(hlo); - if (dot_dnums) { - // Use HandleDotHelper() for convs that are actually einsums. 
- spmd::DotGeneralDimsMapping mapping; - for (const auto& dims : dot_dnums->batch_dims) { - mapping.batch_dims.emplace_back(); - mapping.batch_dims.back().lhs = dims.lhs; - mapping.batch_dims.back().rhs = dims.rhs; - mapping.batch_dims.back().output = dims.output; - } - for (const auto& dims : dot_dnums->contracting_dims) { - mapping.contracting_dims.emplace_back(); - mapping.contracting_dims.back().lhs = dims.lhs; - mapping.contracting_dims.back().rhs = dims.rhs; - mapping.contracting_dims.back().output = dims.output; - } - for (const auto& dims : dot_dnums->lhs_non_contracting_dims) { - mapping.lhs_non_contracting_dims.emplace_back(); - mapping.lhs_non_contracting_dims.back().lhs = dims.lhs; - mapping.lhs_non_contracting_dims.back().rhs = dims.rhs; - mapping.lhs_non_contracting_dims.back().output = dims.output; - } - for (const auto& dims : dot_dnums->rhs_non_contracting_dims) { - mapping.rhs_non_contracting_dims.emplace_back(); - mapping.rhs_non_contracting_dims.back().lhs = dims.lhs; - mapping.rhs_non_contracting_dims.back().rhs = dims.rhs; - mapping.rhs_non_contracting_dims.back().output = dims.output; - } - auto create_sharded_conv = - [&](HloInstruction* lhs_hlo, HloInstruction* rhs_hlo, - spmd::SpmdBuilder* b) -> StatusOr { - TF_ASSIGN_OR_RETURN( - auto sharded_conv, - dot_as_convolution_util::CreateShardedConvForDotGeneralConvolution( - *hlo, *dot_dnums, lhs_hlo, rhs_hlo)); - return b->AddInstruction(std::move(sharded_conv)); - }; - return HandleDotHelper(hlo, mapping, create_sharded_conv); - } - - auto lhs = GetPartitionedHlo(hlo->operand(0)); - auto rhs = GetPartitionedHlo(hlo->operand(1)); - const HloSharding& sharding = hlo->sharding(); - const auto& dnums = hlo->convolution_dimension_numbers(); - std::vector rhs_to_lhs_indices(hlo->shape().rank()); - rhs_to_lhs_indices[dnums.kernel_output_feature_dimension()] = - dnums.input_batch_dimension(); - rhs_to_lhs_indices[dnums.kernel_input_feature_dimension()] = - dnums.input_feature_dimension(); - for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { - rhs_to_lhs_indices[dnums.kernel_spatial_dimensions(i)] = - dnums.input_spatial_dimensions(i); - } - std::vector lhs_to_rhs_indices(hlo->shape().rank()); - for (int64 i = 0; i < rhs_to_lhs_indices.size(); ++i) { - lhs_to_rhs_indices[rhs_to_lhs_indices[i]] = i; - } - auto aligned_rhs_sharding = - hlo_sharding_util::TransposeSharding(lhs.sharding(), rhs_to_lhs_indices); - auto aligned_lhs_sharding = - hlo_sharding_util::TransposeSharding(rhs.sharding(), lhs_to_rhs_indices); - - // Handling cases where both operands' shardings are aligned. We check that - // the LHS batch dimension is not partitioned because it is mapped to the - // output feature dimension in aligned_rhs_sharding, which are not the same - // dimension. - if (!lhs.sharding().IsTileMaximal() && !rhs.sharding().IsTileMaximal()) { - if (options_.conv_halo_exchange_always_on_lhs) { - return HandleConvolutionTiledLhsAndRhs(hlo); - } else { - // Reshard RHS so that each shard computes the partial sum of the full - // shape result, and add AllReduce. See HandleConvolutionTiledLhsAndRhs() - // that reshards LHS. - // - // The size of halo on each dimension can be calculated from the - // projection onto the RHS that shard i needs to read. RHS and LHS below - // refers to the shard size of RHS and LHS, WC is the number of windows, - // and D is the window dilation. 
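Both convolution paths above align the operand shardings through a kernel-to-input dimension map and its inverse. A minimal sketch of that index plumbing with made-up NHWC/HWIO dimension numbers (illustrative values only):

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Hypothetical NHWC input / HWIO kernel dimension numbers.
  const int64_t input_batch = 0, input_feature = 3;
  const std::vector<int64_t> input_spatial = {1, 2};
  const int64_t kernel_out_feature = 3, kernel_in_feature = 2;
  const std::vector<int64_t> kernel_spatial = {0, 1};

  const int64_t rank = 4;
  std::vector<int64_t> rhs_to_lhs(rank);
  rhs_to_lhs[kernel_out_feature] = input_batch;
  rhs_to_lhs[kernel_in_feature] = input_feature;
  for (size_t i = 0; i < input_spatial.size(); ++i) {
    rhs_to_lhs[kernel_spatial[i]] = input_spatial[i];
  }

  // Invert the map so an LHS sharding can also be transposed onto the RHS.
  std::vector<int64_t> lhs_to_rhs(rank);
  for (int64_t i = 0; i < rank; ++i) lhs_to_rhs[rhs_to_lhs[i]] = i;

  for (int64_t i = 0; i < rank; ++i) {
    std::cout << "rhs dim " << i << " <-> lhs dim " << rhs_to_lhs[i] << "\n";
  }
  std::cout << "lhs feature dim " << input_feature << " maps to rhs dim "
            << lhs_to_rhs[input_feature] << "\n";
  return 0;
}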
- // - // * offset(i): LHS * i + low_padding - (WC - 1) * stride - // * limit(i): LHS * (i + 1) + low_padding - // - // Since shard i has RHS of range [i * RHS * D, (i + 1) * RHS * D) - // * left-halo: i * RHS - offset(i) - // = i * (RHS * D - LHS) + (WC - 1) * stride - low_padding - // * right-halo: limit(i) - (i + 1) * RHS - // = (i + 1) * (LHS - RHS * D) + low_pading - - auto unsupported_sharding = [&](const HloSharding& lhs_sharding, - const HloSharding& rhs_sharding) { - // We currently don't support partitioning input batch or output feature - // dimensions. - return lhs_sharding.tile_assignment().dim( - dnums.input_batch_dimension()) != 1 || - rhs_sharding.tile_assignment().dim( - dnums.kernel_output_feature_dimension()) != 1; - }; - auto zero = b_.AddInstruction(HloInstruction::CreateConstant( - LiteralUtil::Zero(hlo->shape().element_type()))); - if (ShapeSizeInBytes(lhs.base_shape()) < - ShapeSizeInBytes(rhs.base_shape())) { - if (unsupported_sharding(aligned_lhs_sharding, rhs.sharding())) { - return DefaultAction(hlo); - } - lhs = lhs.Reshard(aligned_lhs_sharding).PadWithValue(zero); - rhs = rhs.PadWithValue(zero); - } else { - if (unsupported_sharding(lhs.sharding(), aligned_rhs_sharding)) { - return DefaultAction(hlo); - } - lhs = lhs.PadWithValue(zero); - rhs = rhs.Reshard(aligned_rhs_sharding).PadWithValue(zero); - } - - Window window = hlo->window(); - std::vector shard_counts(dnums.input_spatial_dimensions_size()); - std::vector lhs_shard_sizes(dnums.input_spatial_dimensions_size()); - std::vector rhs_shard_sizes(dnums.input_spatial_dimensions_size()); - for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { - int64 lhs_dimension = dnums.input_spatial_dimensions(i); - int64 rhs_dimension = dnums.kernel_spatial_dimensions(i); - int64 shard_count = rhs.sharding().tile_assignment().dim(rhs_dimension); - auto wd = window.dimensions(i); - if (wd.base_dilation() != 1 || wd.window_reversal()) { - return DefaultAction(hlo); - } - - int64 lhs_shard_size = CeilOfRatio( - lhs.base_shape().dimensions(lhs_dimension), shard_count); - int64 rhs_shard_size = CeilOfRatio( - rhs.base_shape().dimensions(rhs_dimension), shard_count); - shard_counts[i] = shard_count; - lhs_shard_sizes[i] = lhs_shard_size; - rhs_shard_sizes[i] = rhs_shard_size; - } - - std::vector left_halo_size_functions( - hlo->shape().rank()); - std::vector right_halo_size_functions( - hlo->shape().rank()); - Window new_window = window; - - // Data structures needed for Pad and DynamicSlice on LHS if needed. - bool need_dynamic_slice_lhs = false; - auto partition_ordinals = - MakeTiledPartitionOrdinals(lhs.sharding(), partition_id_, &b_); - std::vector zero_padding(hlo->shape().rank()); - PaddingConfig pad_config = - window_util::MakeSymmetricPadding(zero_padding); - auto zero_s32 = b_.AddInstruction( - HloInstruction::CreateConstant(LiteralUtil::Zero(S32))); - std::vector dynamic_slice_start_indices( - hlo->shape().rank(), zero_s32); - Shape dynamic_slice_shape = lhs.hlo()->shape(); - Shape pad_shape = lhs.hlo()->shape(); - - for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { - int64 lhs_dimension = dnums.input_spatial_dimensions(i); - int64 rhs_dimension = dnums.kernel_spatial_dimensions(i); - int64 lhs_shard_size = lhs_shard_sizes[i]; - int64 rhs_shard_size = rhs_shard_sizes[i]; - - if (shard_counts[i] == 1) { - continue; - } - - // Calculate the left and right halo sizes as described in the comments - // above. 
It calculcates the halo sizes with dilation, so we apply - // CeilOfRatio({left,right}_halo_size, window_dilation). - auto wd = window.dimensions(i); - int64 padding_low = wd.padding_low(); - int64 padding_high = wd.padding_high(); - int64 base = lhs.base_shape().dimensions(lhs_dimension); - int64 window_count = - 1 + (padding_low + padding_high + base - - (1 + (wd.size() - 1) * wd.window_dilation())) / - wd.stride(); - left_halo_size_functions[rhs_dimension] = - OffsetCalculation(MultiplyAddDivideOffsetCalculation( - rhs_shard_size * wd.window_dilation() - lhs_shard_size, - (window_count - 1) * wd.stride() - padding_low + - wd.window_dilation() - 1, - wd.window_dilation())); - right_halo_size_functions[rhs_dimension] = - OffsetCalculation(MultiplyAddDivideOffsetCalculation( - lhs_shard_size - rhs_shard_size * wd.window_dilation(), - lhs_shard_size - rhs_shard_size * wd.window_dilation() + - padding_low + wd.window_dilation() - 1, - wd.window_dilation())); - - // New RHS window size includes the maximum of both left and right - // halos. - int64 halo_size = left_halo_size_functions[rhs_dimension].MaxInRange( - 1, shard_counts[i]) + - right_halo_size_functions[rhs_dimension].MaxInRange( - 0, shard_counts[i] - 1); - int64 new_window_size = - rhs.hlo()->shape().dimensions(rhs_dimension) + halo_size; - - // The amount of new low padding could be dynamic (e.g., window_dilation - // != 1), which requires pad (to the maximum) and dynamic slice on LHS. - // - // If we consider the first window, the offset of the dilated RHS that - // aligns with the first valid LHS element for shard i is 'padding_low + - // LHS * i'. When the left halo is added to RHS, the offset of the first - // RHS element is (RHS * i - left_halo) * window_dilation. The - // difference between the two values is the amount of padding_low we - // need on LHS. - auto new_padding_low_function = - OffsetCalculation( - HloOpcode::kMultiply, left_halo_size_functions[rhs_dimension], - OffsetCalculation(MultiplyAddDivideOffsetCalculation( - 0, wd.window_dilation(), 1))) - - OffsetCalculation(MultiplyAddDivideOffsetCalculation( - rhs_shard_size * wd.window_dilation() - lhs_shard_size, - -padding_low, 1)); - - int64 new_padding_low_max = - new_padding_low_function.MaxInRange(0, shard_counts[i]); - int64 new_padding_low = new_padding_low_max; - int64 new_padding_high = window_count * wd.stride() + - (new_window_size - 1) * wd.window_dilation() - - new_padding_low - lhs_shard_size; - - // We do pad/dynamic-slice only when the padding is dynamic. - if (!new_padding_low_function.IsConstant()) { - need_dynamic_slice_lhs = true; - new_padding_low = 0; - pad_config.mutable_dimensions(lhs_dimension) - ->set_edge_padding_low(new_padding_low_max); - pad_config.mutable_dimensions(lhs_dimension) - ->set_edge_padding_high(new_padding_low_max); - pad_shape.set_dimensions(lhs_dimension, - lhs_shard_size + 2 * new_padding_low_max); - dynamic_slice_start_indices[lhs_dimension] = - (OffsetCalculation(MultiplyAddDivideOffsetCalculation( - 0, new_padding_low_max, 1)) - - new_padding_low_function) - .Calculate(partition_ordinals[lhs_dimension], &b_); - dynamic_slice_shape.set_dimensions( - lhs_dimension, lhs_shard_size + new_padding_low_max); - } - - // Since the convolution RHS operand size increased with halos, adjust - // the window config accordingly. 
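MultiplyAddDivideOffsetCalculation evaluates f(i) = (multiplier * i + offset) / divisor per shard index, and the extra RHS window size is the maximum of the left and right halo functions over their shard ranges. A rough stand-in with a brute-forced MaxInRange (the real class solves the maximum analytically) and hypothetical sizes:

#include <algorithm>
#include <cstdint>
#include <iostream>

// Minimal stand-in for MultiplyAddDivideOffsetCalculation on non-negative
// results: f(i) = (m * i + a) / d. MaxInRange is brute-forced here.
struct AffineDivide {
  int64_t multiplier, offset, divisor;
  int64_t Calculate(int64_t i) const {
    return (multiplier * i + offset) / divisor;
  }
  int64_t MaxInRange(int64_t begin, int64_t end) const {  // over [begin, end)
    int64_t best = Calculate(begin);
    for (int64_t i = begin + 1; i < end; ++i) best = std::max(best, Calculate(i));
    return best;
  }
};

int main() {
  // Hypothetical sizes for one spatial dimension of the RHS-halo variant.
  const int64_t lhs_shard = 8, rhs_shard = 2, dilation = 2, stride = 1;
  const int64_t padding_low = 1, window_count = 13, shards = 2;

  AffineDivide left{rhs_shard * dilation - lhs_shard,
                    (window_count - 1) * stride - padding_low + dilation - 1,
                    dilation};
  AffineDivide right{lhs_shard - rhs_shard * dilation,
                     lhs_shard - rhs_shard * dilation + padding_low + dilation - 1,
                     dilation};

  const int64_t halo =
      left.MaxInRange(1, shards) + right.MaxInRange(0, shards - 1);
  std::cout << "extra window size to add to the RHS shard: " << halo << "\n";
  return 0;
}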
- new_window.mutable_dimensions(i)->set_padding_low(new_padding_low); - new_window.mutable_dimensions(i)->set_padding_high(new_padding_high); - new_window.mutable_dimensions(i)->set_size( - rhs.hlo()->shape().dimensions(rhs_dimension) + halo_size); - } - - HloInstruction* conv_lhs = lhs.hlo(); - if (need_dynamic_slice_lhs) { - auto pad = b_.AddInstruction( - HloInstruction::CreatePad(pad_shape, lhs.hlo(), zero, pad_config)); - conv_lhs = b_.AddInstruction(HloInstruction::CreateDynamicSlice( - dynamic_slice_shape, pad, dynamic_slice_start_indices, - dynamic_slice_shape.dimensions())); - } - - // Exchange halo and concatenate. - HloInstruction* rhs_with_halo = rhs.hlo(); - for (int i = 0; i < dnums.kernel_spatial_dimensions_size(); ++i) { - int64 dim = dnums.kernel_spatial_dimensions(i); - int64 explicit_left_padding_on_full_shape = - left_halo_size_functions[dim].Calculate(0); - int64 shard_size_with_halo = new_window.dimensions(i).size(); - - // offset_on_padded_shape and padded_full_shape_size are needed only if - // we want to mask out-of-range values in ExchangeHaloAndGetValidData(). - // Since the default value for both the collective-permute is zero and - // also we call PadWithValue() on both operands at the beginning, we - // don't need to mask here. - // - // TODO(hyoulkee): Consider removing one of the two PadWithValue() calls - // if it's always safe. - auto offset_on_padded_shape = - OffsetCalculation(MultiplyAddDivideOffsetCalculation( - rhs_shard_sizes[i], explicit_left_padding_on_full_shape, 1)) - - left_halo_size_functions[dim]; - int64 padded_full_shape_size = - offset_on_padded_shape.Calculate(shard_counts[i] - 1) + - new_window.dimensions(i).size(); - auto concat = ExchangeHaloAndGetValidData( - rhs_with_halo, rhs.base_shape(), left_halo_size_functions[dim], - right_halo_size_functions[dim], explicit_left_padding_on_full_shape, - padded_full_shape_size, shard_size_with_halo, dim, rhs.sharding(), - offset_on_padded_shape.Calculate(partition_ordinals[dim], &b_), - zero, partition_ordinals[dim], collective_ops_creator_, - next_channel_id_, &b_, /*mask_invalid_region=*/false); - if (!concat) { - return DefaultAction(hlo); - } - rhs_with_halo = *concat; - } - - SetPartitionedHlo(hlo, [&]() { - auto conv = b_.AddInstruction(HloInstruction::CreateConvolve( - hlo->shape(), conv_lhs, rhs_with_halo, hlo->feature_group_count(), - hlo->batch_group_count(), new_window, dnums, - hlo->precision_config())); - auto ar = collective_ops_creator_.create_cross_partition_all_reduce( - &b_, conv, MakeBinaryAdd(hlo->shape().element_type(), module_), - NewChannel()); - ar->set_sharding(HloSharding::Replicate()); - return PartitionedHlo(ar, hlo->shape(), MakePartitioningState()) - .Reshard(hlo->sharding()) - .hlo(); - }); - return Status::OK(); - } - } - - if (!sharding.IsTileMaximal()) { - // We don't currently support sharding on output feature dimension. - if (sharding.tile_assignment().dim(dnums.output_feature_dimension()) > 1) { - return DefaultAction(hlo); - } - - // Check if the operand and the output sharding are aligned. 
- std::vector input_to_output_indices(hlo->shape().rank()); - input_to_output_indices[dnums.input_batch_dimension()] = - dnums.output_batch_dimension(); - input_to_output_indices[dnums.input_feature_dimension()] = - dnums.output_feature_dimension(); - for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { - input_to_output_indices[dnums.input_spatial_dimensions(i)] = - dnums.output_spatial_dimensions(i); - } - auto target_operand_sharding = - hlo_sharding_util::TransposeSharding(sharding, input_to_output_indices); - lhs = lhs.Reshard(target_operand_sharding); - - // Replicate the RHS. - rhs = rhs.Reshard(HloSharding::Replicate()); - - // Convolution window config does not include batch and feature dimensions, - // whereas ReshardAsWindowedInput() expects the same number of window - // dimensions as the rank of the operand. So add two more trivial - // dimensions. - std::vector ones(hlo->shape().rank(), 1); - auto operand_window = window_util::MakeWindow(ones); - for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { - *operand_window.mutable_dimensions(dnums.input_spatial_dimensions(i)) = - hlo->window().dimensions(i); - } - - auto zero = b_.AddInstruction(HloInstruction::CreateConstant( - LiteralUtil::Zero(hlo->shape().element_type()))); - auto resharded_operand_and_window = lhs.ReshardAsWindowedInput( - operand_window, target_operand_sharding, zero); - if (!resharded_operand_and_window.has_value()) { - return DefaultAction(hlo); - } - Window new_window; - for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { - *new_window.add_dimensions() = - resharded_operand_and_window->shard_window.dimensions( - dnums.input_spatial_dimensions(i)); - } - TF_ASSIGN_OR_RETURN( - Shape sharded_conv_shape, - ShapeInference::InferConvolveShape( - resharded_operand_and_window->sharded_input->shape(), - rhs.hlo()->shape(), hlo->feature_group_count(), - hlo->batch_group_count(), new_window, dnums)); - auto shard_shape = MakePartitionedShape(hlo->shape(), hlo->sharding()); - *sharded_conv_shape.mutable_layout() = shard_shape.layout(); - SetPartitionedHlo(hlo, [&]() { - auto sharded_conv = b_.AddInstruction(HloInstruction::CreateConvolve( - sharded_conv_shape, resharded_operand_and_window->sharded_input, - rhs.hlo(), hlo->feature_group_count(), hlo->batch_group_count(), - new_window, dnums, hlo->precision_config())); - if (!resharded_operand_and_window->dynamic_slice_index_on_output - .has_value()) { - CHECK(ShapeUtil::Compatible(shard_shape, sharded_conv->shape())); - return sharded_conv; - } - return b_.AddInstruction(HloInstruction::CreateDynamicSlice( - shard_shape, sharded_conv, - *resharded_operand_and_window->dynamic_slice_index_on_output, - shard_shape.dimensions())); - }); - return Status::OK(); - } - return DefaultAction(hlo); -} - -Status SpmdPartitioningVisitor::HandleDot(HloInstruction* hlo) { - DotGeneralDimsMapping mapping; - const auto& dnums = hlo->dot_dimension_numbers(); - int64 next_output_dim = 0; - for (int64 i = 0; i < dnums.lhs_batch_dimensions_size(); ++i) { - mapping.batch_dims.emplace_back(); - mapping.batch_dims.back().lhs = dnums.lhs_batch_dimensions(i); - mapping.batch_dims.back().rhs = dnums.rhs_batch_dimensions(i); - mapping.batch_dims.back().output = next_output_dim++; - } - for (int64 i = 0; i < dnums.lhs_contracting_dimensions_size(); ++i) { - mapping.contracting_dims.emplace_back(); - mapping.contracting_dims.back().lhs = dnums.lhs_contracting_dimensions(i); - mapping.contracting_dims.back().rhs = dnums.rhs_contracting_dimensions(i); - 
mapping.contracting_dims.back().output = -1; - } - for (int64 i = 0; i < hlo->operand(0)->shape().rank(); ++i) { - if (absl::c_linear_search(dnums.lhs_batch_dimensions(), i) || - absl::c_linear_search(dnums.lhs_contracting_dimensions(), i)) { - continue; - } - mapping.lhs_non_contracting_dims.emplace_back(); - mapping.lhs_non_contracting_dims.back().lhs = i; - mapping.lhs_non_contracting_dims.back().rhs = -1; - mapping.lhs_non_contracting_dims.back().output = next_output_dim++; - } - for (int64 i = 0; i < hlo->operand(1)->shape().rank(); ++i) { - if (absl::c_linear_search(dnums.rhs_batch_dimensions(), i) || - absl::c_linear_search(dnums.rhs_contracting_dimensions(), i)) { - continue; - } - mapping.rhs_non_contracting_dims.emplace_back(); - mapping.rhs_non_contracting_dims.back().lhs = -1; - mapping.rhs_non_contracting_dims.back().rhs = i; - mapping.rhs_non_contracting_dims.back().output = next_output_dim++; - } - auto create_sharded_dot = [&](HloInstruction* l, HloInstruction* r, - SpmdBuilder* b) -> StatusOr { - TF_ASSIGN_OR_RETURN( - auto sharded_dot_shape, - ShapeInference::InferDotOpShape(l->shape(), r->shape(), - hlo->dot_dimension_numbers())); - return b->AddInstruction(HloInstruction::CreateDot( - sharded_dot_shape, l, r, hlo->dot_dimension_numbers(), - hlo->precision_config())); - }; - return HandleDotHelper(hlo, mapping, create_sharded_dot); -} - -Status SpmdPartitioningVisitor::HandleDotHelper( - HloInstruction* hlo, const DotGeneralDimsMapping& dims_mapping, - const std::function( - HloInstruction*, HloInstruction*, SpmdBuilder*)>& create_sharded_dot) { - const HloSharding& lhs_sharding = hlo->operand(0)->sharding(); - const HloSharding& rhs_sharding = hlo->operand(1)->sharding(); - - // Similar to hlo_sharding_util::TransposeSharding(), but allows - // removing/adding non-partitioned dimensions. 
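The deleted HandleDot builds its dims mapping by classifying every dimension as batch, contracting, or non-contracting and handing out output positions in that order. A self-contained sketch of the classification for a hypothetical [B, M, K] x [B, K, N] dot, using an illustrative struct rather than the real DotGeneralDimsMapping:

#include <cstdint>
#include <iostream>
#include <vector>

struct DimMapping {
  int64_t lhs, rhs, output;  // -1 means "not present on that side"
};

bool Contains(const std::vector<int64_t>& v, int64_t x) {
  for (int64_t e : v) {
    if (e == x) return true;
  }
  return false;
}

int main() {
  // Hypothetical dot: lhs [B, M, K] x rhs [B, K, N] -> out [B, M, N].
  const int64_t lhs_rank = 3, rhs_rank = 3;
  const std::vector<int64_t> lhs_batch = {0}, rhs_batch = {0};
  const std::vector<int64_t> lhs_contracting = {2}, rhs_contracting = {1};

  std::vector<DimMapping> batch, contracting, lhs_nc, rhs_nc;
  int64_t next_output_dim = 0;
  for (size_t i = 0; i < lhs_batch.size(); ++i) {
    batch.push_back({lhs_batch[i], rhs_batch[i], next_output_dim++});
  }
  for (size_t i = 0; i < lhs_contracting.size(); ++i) {
    contracting.push_back({lhs_contracting[i], rhs_contracting[i], -1});
  }
  for (int64_t i = 0; i < lhs_rank; ++i) {
    if (Contains(lhs_batch, i) || Contains(lhs_contracting, i)) continue;
    lhs_nc.push_back({i, -1, next_output_dim++});
  }
  for (int64_t i = 0; i < rhs_rank; ++i) {
    if (Contains(rhs_batch, i) || Contains(rhs_contracting, i)) continue;
    rhs_nc.push_back({-1, i, next_output_dim++});
  }

  std::cout << "batch: " << batch.size()
            << ", contracting: " << contracting.size()
            << ", lhs non-contracting: " << lhs_nc.size()
            << ", rhs non-contracting: " << rhs_nc.size()
            << ", output rank: " << next_output_dim << "\n";
  return 0;
}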
- auto transpose_sharding = - [&](const HloSharding& source, absl::Span src_to_tgt, - absl::Span tgt_to_src) -> absl::optional { - if (source.IsTileMaximal()) { - return source; - } - std::vector tgt_dims_skipping_new(tgt_to_src.size(), -1); - int64 skipped_tgt_dims = 0; - for (int64 i = 0; i < tgt_to_src.size(); ++i) { - if (tgt_to_src[i] < 0) { - skipped_tgt_dims++; - } else { - tgt_dims_skipping_new[i] = i - skipped_tgt_dims; - } - } - int64 skipped_src_dims = absl::c_count(src_to_tgt, -1); - std::vector perm(src_to_tgt.size()); - for (int64 i = 0; i < src_to_tgt.size(); ++i) { - if (src_to_tgt[i] < 0) { - if (source.tile_assignment().dim(i) > 1) { - return absl::nullopt; - } - perm[src_to_tgt.size() - skipped_src_dims] = i; - skipped_src_dims--; - } else { - perm[tgt_dims_skipping_new[src_to_tgt[i]]] = i; - } - } - auto tgt_sharding = hlo_sharding_util::TransposeSharding(source, perm); - if (skipped_tgt_dims == 0) { - return tgt_sharding; - } - auto reshape_tiles = tgt_sharding.tile_assignment(); - std::vector tgt_tiles(tgt_to_src.size(), 1); - for (int64 i = 0; i < tgt_tiles.size(); ++i) { - if (tgt_to_src[i] >= 0) { - tgt_tiles[i] = reshape_tiles.dim(tgt_dims_skipping_new[i]); - } - } - reshape_tiles.Reshape(tgt_tiles); - return HloSharding::Tile(reshape_tiles); - }; - - std::vector lhs_to_rhs_indices(hlo->operand(0)->shape().rank(), -1); - std::vector lhs_to_output_indices(hlo->operand(0)->shape().rank(), -1); - std::vector rhs_to_lhs_indices(hlo->operand(1)->shape().rank(), -1); - std::vector rhs_to_output_indices(hlo->operand(1)->shape().rank(), -1); - std::vector output_to_lhs_indices(hlo->shape().rank(), -1); - std::vector output_to_rhs_indices(hlo->shape().rank(), -1); - auto populate_indices_mapping = - [&](const DotGeneralDimsMapping::DimsMapping& mapping) { - if (mapping.lhs >= 0) { - lhs_to_rhs_indices[mapping.lhs] = mapping.rhs; - lhs_to_output_indices[mapping.lhs] = mapping.output; - } - if (mapping.rhs >= 0) { - rhs_to_lhs_indices[mapping.rhs] = mapping.lhs; - rhs_to_output_indices[mapping.rhs] = mapping.output; - } - if (mapping.output >= 0) { - output_to_lhs_indices[mapping.output] = mapping.lhs; - output_to_rhs_indices[mapping.output] = mapping.rhs; - } - }; - for (const auto& mapping : dims_mapping.batch_dims) { - populate_indices_mapping(mapping); - } - for (const auto& mapping : dims_mapping.contracting_dims) { - populate_indices_mapping(mapping); - } - for (const auto& mapping : dims_mapping.lhs_non_contracting_dims) { - populate_indices_mapping(mapping); - } - for (const auto& mapping : dims_mapping.rhs_non_contracting_dims) { - populate_indices_mapping(mapping); - } - auto lhs_sharding_transposed_to_match_rhs = - transpose_sharding(lhs_sharding, lhs_to_rhs_indices, rhs_to_lhs_indices); - auto rhs_sharding_transposed_to_match_lhs = - transpose_sharding(rhs_sharding, rhs_to_lhs_indices, lhs_to_rhs_indices); - auto lhs_sharding_transposed_to_match_output = transpose_sharding( - lhs_sharding, lhs_to_output_indices, output_to_lhs_indices); - auto rhs_sharding_transposed_to_match_output = transpose_sharding( - rhs_sharding, rhs_to_output_indices, output_to_rhs_indices); - auto output_sharding_transposed_to_match_lhs = transpose_sharding( - hlo->sharding(), output_to_lhs_indices, lhs_to_output_indices); - auto output_sharding_transposed_to_match_rhs = transpose_sharding( - hlo->sharding(), output_to_rhs_indices, rhs_to_output_indices); - - // lhs_rhs_or_output: 0 lhs, 1 rhs, 2 output. 
- auto get_partitions_for_dims = - [&](const HloSharding& sharding, - absl::Span dims, - int lhs_rhs_or_output) { - int64 partitions = 1; - if (sharding.IsTileMaximal()) { - return partitions; - } - for (const auto& dim : dims) { - if (lhs_rhs_or_output == 0) { - partitions *= sharding.tile_assignment().dim(dim.lhs); - } else if (lhs_rhs_or_output == 1) { - partitions *= sharding.tile_assignment().dim(dim.rhs); - } else { - CHECK_EQ(lhs_rhs_or_output, 2); - partitions *= sharding.tile_assignment().dim(dim.output); - } - } - return partitions; - }; - const int64 lhs_batch_partitions = - get_partitions_for_dims(lhs_sharding, dims_mapping.batch_dims, 0); - const int64 rhs_batch_partitions = - get_partitions_for_dims(rhs_sharding, dims_mapping.batch_dims, 1); - const int64 output_batch_partitions = - get_partitions_for_dims(hlo->sharding(), dims_mapping.batch_dims, 2); - const int64 lhs_contracting_partitions = - get_partitions_for_dims(lhs_sharding, dims_mapping.contracting_dims, 0); - const int64 rhs_contracting_partitions = - get_partitions_for_dims(rhs_sharding, dims_mapping.contracting_dims, 1); - const int64 lhs_non_contracting_partitions = get_partitions_for_dims( - lhs_sharding, dims_mapping.lhs_non_contracting_dims, 0); - const int64 rhs_non_contracting_partitions = get_partitions_for_dims( - rhs_sharding, dims_mapping.rhs_non_contracting_dims, 1); - const int64 output_lhs_non_contracting_partitions = get_partitions_for_dims( - hlo->sharding(), dims_mapping.lhs_non_contracting_dims, 2); - const int64 output_rhs_non_contracting_partitions = get_partitions_for_dims( - hlo->sharding(), dims_mapping.rhs_non_contracting_dims, 2); - - auto& lhs = GetPartitionedHlo(hlo->operand(0)); - auto& rhs = GetPartitionedHlo(hlo->operand(1)); - // LHS and RHS are partitioned the same way and only partitioned in batch - // dimensions. - if (lhs_batch_partitions == rhs_batch_partitions && - rhs_batch_partitions == num_partitions_ && - lhs_sharding_transposed_to_match_rhs == rhs_sharding) { - TF_ASSIGN_OR_RETURN(auto dot, - create_sharded_dot(lhs.hlo(), rhs.hlo(), &b_)); - SetPartitionedHlo(hlo, [&] { - dot->set_sharding(*lhs_sharding_transposed_to_match_output); - return PartitionedHlo(dot, hlo->shape(), MakePartitioningState()) - .Reshard(hlo->sharding()) - .hlo(); - }); - return Status::OK(); - } - - // Try emit batch-partitioned einsum with one operand resharded. Returns - // whether the attempt succeeds. If may_reshard_with_allreduce is false, - // reshard must be done using all-to-all; otherwise this attempt fails. - auto try_emit_output_batch_partitioned_einsum_with_reshard = - [&](bool may_reshard_with_allreduce) -> StatusOr { - // LHS and output are batch partitioned in the same way. - if (lhs_batch_partitions == num_partitions_ && - output_batch_partitions == num_partitions_ && - lhs_sharding_transposed_to_match_output == hlo->sharding()) { - if (!may_reshard_with_allreduce && - !CanReshardWithAllToAll(rhs.sharding(), - *lhs_sharding_transposed_to_match_rhs)) { - return false; - } - auto resharded_rhs = rhs.Reshard(*lhs_sharding_transposed_to_match_rhs); - TF_ASSIGN_OR_RETURN( - auto dot, create_sharded_dot(lhs.hlo(), resharded_rhs.hlo(), &b_)); - SetPartitionedHlo(hlo, [&] { return dot; }); - return true; - } - // RHS and output are batch partitioned in the same way. 
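Strategy selection in HandleDotHelper is driven by how many partitions each dimension group spans, and get_partitions_for_dims is just a product of tile-assignment extents. A tiny sketch for a hypothetical operand tiled 2-way on its batch dimension and 4-way on its contracting dimension:

#include <cstdint>
#include <iostream>
#include <vector>

// Sketch of get_partitions_for_dims: multiply the tile-assignment extents of
// the requested dimensions; a replicated (tile-maximal) sharding contributes 1.
int64_t PartitionsForDims(const std::vector<int64_t>& tile_dims,
                          const std::vector<int64_t>& dims) {
  int64_t partitions = 1;
  for (int64_t d : dims) partitions *= tile_dims[d];
  return partitions;
}

int main() {
  // Hypothetical LHS [B, M, K] tiled 2-way on B and 4-way on K.
  const std::vector<int64_t> lhs_tile = {2, 1, 4};
  std::cout << "batch partitions: " << PartitionsForDims(lhs_tile, {0}) << "\n";
  std::cout << "contracting partitions: " << PartitionsForDims(lhs_tile, {2})
            << "\n";
  return 0;
}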
- if (rhs_batch_partitions == num_partitions_ && - output_batch_partitions == num_partitions_ && - rhs_sharding_transposed_to_match_output == hlo->sharding()) { - if (!may_reshard_with_allreduce && - !CanReshardWithAllToAll(lhs.sharding(), - *rhs_sharding_transposed_to_match_lhs)) { - return false; - } - auto resharded_lhs = lhs.Reshard(*rhs_sharding_transposed_to_match_lhs); - TF_ASSIGN_OR_RETURN( - auto dot, create_sharded_dot(resharded_lhs.hlo(), rhs.hlo(), &b_)); - SetPartitionedHlo(hlo, [&] { return dot; }); - return true; - } - return false; - }; - - { - // Try batch-parallel by resharding one operand, and not using all-reduce. - TF_ASSIGN_OR_RETURN( - bool emitted, - try_emit_output_batch_partitioned_einsum_with_reshard(false)); - if (emitted) { - return Status::OK(); - } - } - - // Try to emit windowed DotGeneral when one operand is partitioned in the same - // way as the output along non-contracting dimensions, but the other operand - // is tiled in other dimensions. - auto emit_windowed_dot_general = [&](int64 matching_operand, - int64 windowing_operand, - bool windowed_at_contracting_dims, - bool windowed_at_batch_dims) { - CHECK_EQ(matching_operand + windowing_operand, 1); - CHECK(!windowed_at_batch_dims || !windowed_at_contracting_dims); - auto unpadded_result_buffer_shape = - MakePartitionedShape(hlo->shape(), hlo->sharding()); - auto padded_result_buffer_shape = unpadded_result_buffer_shape; - // For windowing at batch/non-contracting dims, we produce the result one - // partition at a time, so we need to pad the shape in case of uneven - // partitioning in order to make dynamic-update-slice in-bound. - if (!windowed_at_contracting_dims) { - padded_result_buffer_shape = GetPaddedShapeForUnevenPartitioning( - padded_result_buffer_shape, - windowing_operand == 0 ? *lhs_sharding_transposed_to_match_output - : *rhs_sharding_transposed_to_match_output); - } - // Mask the padding area of the windowed operand with zero if there is - // uneven partitioning. - if (windowed_at_contracting_dims) { - auto& to_mask = windowing_operand == 0 ? lhs : rhs; - to_mask = - to_mask.PadWithValue(b_.AddInstruction(HloInstruction::CreateConstant( - LiteralUtil::Zero(hlo->shape().element_type())))); - } - auto result_buffer = CreateZero(padded_result_buffer_shape, &b_); - auto iteration = b_.AddInstruction( - HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))); - - // Create a while loop that computes one window per iteration. During each - // iteration, each partition sends its input window to its neighbor using - // collective-permute for the next iteration. 
- SpmdBuilder body_b("windowed_dot_general_body", visiting_hlo_); - auto param = body_b.AddInstruction(HloInstruction::CreateParameter( - /*parameter_number=*/0, - ShapeUtil::MakeTupleShape({lhs.hlo()->shape(), rhs.hlo()->shape(), - result_buffer->shape(), iteration->shape()}), - "param")); - auto l = body_b.AddInstruction( - HloInstruction::CreateGetTupleElement(lhs.hlo()->shape(), param, 0)); - auto r = body_b.AddInstruction( - HloInstruction::CreateGetTupleElement(rhs.hlo()->shape(), param, 1)); - auto o = body_b.AddInstruction(HloInstruction::CreateGetTupleElement( - result_buffer->shape(), param, 2)); - auto i = body_b.AddInstruction( - HloInstruction::CreateGetTupleElement(iteration->shape(), param, 3)); - - auto partition_id = collective_ops_creator_.create_partition_id(&body_b); - auto data_partition_id = body_b.AddInstruction(HloInstruction::CreateBinary( - i->shape(), HloOpcode::kAdd, i, partition_id)); - auto partition_count = body_b.AddInstruction(HloInstruction::CreateConstant( - LiteralUtil::CreateR0(num_partitions_))); - data_partition_id = body_b.AddInstruction(HloInstruction::CreateBinary( - i->shape(), HloOpcode::kRemainder, data_partition_id, partition_count)); - auto dot_lhs = l; - auto dot_rhs = r; - if (windowed_at_contracting_dims || windowed_at_batch_dims) { - // Slice the matching operand according to the partitioned contracting - // dimensions on the windowed operand. We do this by treating the matching - // operand as replicated, and resharding it to match the windowed operand. - auto slice_operand = matching_operand == 0 ? l : r; - slice_operand->set_sharding(HloSharding::Replicate()); - auto state = MakePartitioningState(); - state.b = &body_b; - state.partition_id = data_partition_id; - auto slice = PartitionedHlo(slice_operand, slice_operand->shape(), state) - .Reshard(windowing_operand == 0 - ? *lhs_sharding_transposed_to_match_rhs - : *rhs_sharding_transposed_to_match_lhs) - .hlo(); - slice_operand->clear_sharding(); - if (matching_operand == 0) { - dot_lhs = slice; - } else { - dot_rhs = slice; - } - } - TF_ASSIGN_OR_RETURN(auto dot, - create_sharded_dot(dot_lhs, dot_rhs, &body_b)); - if (windowed_at_contracting_dims) { - // Accumulate the partial output to the result buffer. - o = body_b.AddInstruction( - HloInstruction::CreateBinary(o->shape(), HloOpcode::kAdd, o, dot)); - } else { - // The windowing operand is partitioned along batch/non-contracting - // dimensions, so we need a dynamic-update-slice to save the partial - // output in the result buffer. - auto offsets = MakePartitionOffsets( - o->shape(), - windowing_operand == 0 ? *lhs_sharding_transposed_to_match_output - : *rhs_sharding_transposed_to_match_output, - data_partition_id, &body_b); - o = body_b.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( - o->shape(), o, dot, offsets)); - } - - // ++i - i = body_b.AddInstruction(HloInstruction::CreateBinary( - i->shape(), HloOpcode::kAdd, i, - body_b.AddInstruction( - HloInstruction::CreateConstant(LiteralUtil::CreateR0(1))))); - auto has_more = body_b.AddInstruction(HloInstruction::CreateCompare( - ShapeUtil::MakeShape(PRED, {}), i, - body_b.AddInstruction(HloInstruction::CreateConstant( - LiteralUtil::CreateR0(num_partitions_))), - ComparisonDirection::kLt)); - // Collective-permute for the next window. We don't need it for the last - // iteration, so we use a conditional around the collective-permute. 
- HloInstruction* conditional; - { - SpmdBuilder cp_b("window_collective_permute", visiting_hlo_); - { - auto p = cp_b.AddInstruction(HloInstruction::CreateParameter( - 0, windowing_operand == 0 ? l->shape() : r->shape(), "window")); - std::vector> sd_pairs(num_partitions_); - for (int64 source = 0; source < num_partitions_; ++source) { - // 0 -> n-1, 1 -> 0, 2 -> 1, ... - sd_pairs[source] = {source, - (source - 1 + num_partitions_) % num_partitions_}; - } - collective_ops_creator_.create_cross_partition_collective_permute( - &cp_b, p, sd_pairs, (*next_channel_id_)++); - } - SpmdBuilder ncp_b("last_iteration_noop", visiting_hlo_); - { - ncp_b.AddInstruction(HloInstruction::CreateParameter( - 0, windowing_operand == 0 ? l->shape() : r->shape(), "window")); - } - conditional = body_b.AddInstruction(HloInstruction::CreateConditional( - windowing_operand == 0 ? l->shape() : r->shape(), has_more, - windowing_operand == 0 ? l : r, - module_->AddEmbeddedComputation(cp_b.Build()), - windowing_operand == 0 ? l : r, - module_->AddEmbeddedComputation(ncp_b.Build()))); - } - if (windowing_operand == 0) { - l = conditional; - } else { - r = conditional; - } - body_b.AddInstruction(HloInstruction::CreateTuple({l, r, o, i})); - - SpmdBuilder cond_b("windowed_dot_general_cond", visiting_hlo_); - auto cond_param = cond_b.AddInstruction(HloInstruction::CreateParameter( - /*parameter_number=*/0, - ShapeUtil::MakeTupleShape({lhs.hlo()->shape(), rhs.hlo()->shape(), - result_buffer->shape(), iteration->shape()}), - "param")); - auto cond_i = cond_b.AddInstruction(HloInstruction::CreateGetTupleElement( - iteration->shape(), cond_param, 3)); - cond_b.AddInstruction(HloInstruction::CreateCompare( - ShapeUtil::MakeShape(PRED, {}), cond_i, - cond_b.AddInstruction(HloInstruction::CreateConstant( - LiteralUtil::CreateR0(num_partitions_))), - ComparisonDirection::kLt)); - auto while_loop = b_.AddInstruction(HloInstruction::CreateWhile( - cond_param->shape(), module_->AddEmbeddedComputation(cond_b.Build()), - module_->AddEmbeddedComputation(body_b.Build()), - b_.AddInstruction(HloInstruction::CreateTuple( - {lhs.hlo(), rhs.hlo(), result_buffer, iteration})))); - windowed_dot_general_loops_.push_back({while_loop, windowing_operand, - windowed_at_contracting_dims, - windowed_at_batch_dims}); - SetPartitionedHlo(hlo, [&] { - auto result = b_.AddInstruction(HloInstruction::CreateGetTupleElement( - result_buffer->shape(), while_loop, 2)); - if (!ShapeUtil::Compatible(padded_result_buffer_shape, - unpadded_result_buffer_shape)) { - result = b_.AddInstruction(HloInstruction::CreateSlice( - unpadded_result_buffer_shape, result, - std::vector(padded_result_buffer_shape.rank(), 0), - unpadded_result_buffer_shape.dimensions(), - std::vector(padded_result_buffer_shape.rank(), 1))); - } - return result; - }); - return Status::OK(); - }; - if (output_lhs_non_contracting_partitions == num_partitions_ && - output_sharding_transposed_to_match_lhs == lhs_sharding && - ShapeSizeInBytes(hlo->operand(1)->shape()) >= - options_.threshold_for_windowed_einsum_mib * 1024 * 1024) { - if (rhs_contracting_partitions == num_partitions_) { - return emit_windowed_dot_general(0, 1, true, false); - } - if (rhs_non_contracting_partitions == num_partitions_) { - return emit_windowed_dot_general(0, 1, false, false); - } - if (rhs_batch_partitions == num_partitions_) { - return emit_windowed_dot_general(0, 1, false, true); - } - } - if (output_rhs_non_contracting_partitions == num_partitions_ && - output_sharding_transposed_to_match_rhs == rhs_sharding && 
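The schedule of the windowed dot-general loop can be checked in isolation: at iteration t, partition p should hold the operand block that started on partition (p + t) mod n, which is exactly what the source to (source - 1 + n) % n collective-permute pairs maintain. A standalone simulation with plain vectors (no HLO involved):

#include <cstdint>
#include <iostream>
#include <vector>

// Simulates the rotation schedule of the windowed dot-general loop.
// Illustrative only; the real loop moves HLO buffers via collective-permute.
int main() {
  const int64_t n = 4;           // num_partitions_
  std::vector<int64_t> held(n);  // which block each partition currently holds
  for (int64_t p = 0; p < n; ++p) held[p] = p;

  for (int64_t t = 0; t < n; ++t) {
    for (int64_t p = 0; p < n; ++p) {
      const int64_t expected = (p + t) % n;  // data_partition_id in the body
      if (held[p] != expected) std::cout << "schedule mismatch!\n";
    }
    // Collective-permute for the next iteration: source -> source - 1 (mod n).
    std::vector<int64_t> next(n);
    for (int64_t source = 0; source < n; ++source) {
      next[(source - 1 + n) % n] = held[source];
    }
    held = next;
  }
  std::cout << "every partition visited every block exactly once\n";
  return 0;
}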
- ShapeSizeInBytes(hlo->operand(0)->shape()) >= - options_.threshold_for_windowed_einsum_mib * 1024 * 1024) { - if (lhs_contracting_partitions == num_partitions_) { - return emit_windowed_dot_general(1, 0, true, false); - } - if (lhs_non_contracting_partitions == num_partitions_) { - return emit_windowed_dot_general(1, 0, false, false); - } - if (lhs_batch_partitions == num_partitions_) { - return emit_windowed_dot_general(1, 0, false, true); - } - } - - { - // Try batch-parallel by resharding one operand, and allowing all-reduce. - TF_ASSIGN_OR_RETURN( - bool emitted, - try_emit_output_batch_partitioned_einsum_with_reshard(true)); - if (emitted) { - return Status::OK(); - } - } - - // LHS and RHS have the same partitioned contracting dimensions. - if (lhs_contracting_partitions == rhs_contracting_partitions && - lhs_contracting_partitions == num_partitions_) { - auto zero = b_.AddInstruction(HloInstruction::CreateConstant( - LiteralUtil::Zero(hlo->shape().element_type()))); - // Pad both sides with zero, since NaN at one side cannot be masked by zero - // on the other side. - if (ShapeSizeInBytes(lhs.base_shape()) < - ShapeSizeInBytes(rhs.base_shape())) { - lhs = - lhs.Reshard(*rhs_sharding_transposed_to_match_lhs).PadWithValue(zero); - rhs = rhs.PadWithValue(zero); - } else { - lhs = lhs.PadWithValue(zero); - rhs = - rhs.Reshard(*lhs_sharding_transposed_to_match_rhs).PadWithValue(zero); - } - TF_ASSIGN_OR_RETURN(auto dot, - create_sharded_dot(lhs.hlo(), rhs.hlo(), &b_)); - SetPartitionedHlo(hlo, [&] { - auto ar = collective_ops_creator_.create_cross_partition_all_reduce( - &b_, dot, MakeBinaryAdd(hlo->shape().element_type(), module_), - NewChannel()); - ar->set_sharding(HloSharding::Replicate()); - return PartitionedHlo(ar, hlo->shape(), MakePartitioningState()) - .Reshard(hlo->sharding()) - .hlo(); - }); - return Status::OK(); - } - - // LHS and output have the same partitioned non-contracting dimensions. - if (lhs_non_contracting_partitions == num_partitions_ && - output_lhs_non_contracting_partitions == num_partitions_ && - lhs_sharding == hlo->sharding()) { - auto rhs_replicated = rhs.Reshard(HloSharding::Replicate()).hlo(); - TF_ASSIGN_OR_RETURN(auto dot, - create_sharded_dot(lhs.hlo(), rhs_replicated, &b_)); - SetPartitionedHlo(hlo, [&] { return dot; }); - return Status::OK(); - } - - // RHS and output have the same partitioned non-contracting dimensions. - if (rhs_non_contracting_partitions == num_partitions_ && - output_rhs_non_contracting_partitions == num_partitions_ && - rhs_sharding_transposed_to_match_output == hlo->sharding()) { - auto lhs_replicated = lhs.Reshard(HloSharding::Replicate()).hlo(); - TF_ASSIGN_OR_RETURN(auto dot, - create_sharded_dot(lhs_replicated, rhs.hlo(), &b_)); - SetPartitionedHlo(hlo, [&] { return dot; }); - return Status::OK(); - } - - // Output is batch partitioned. - if (output_batch_partitions == num_partitions_) { - auto resharded_lhs = lhs.Reshard(*output_sharding_transposed_to_match_lhs); - auto resharded_rhs = rhs.Reshard(*output_sharding_transposed_to_match_rhs); - TF_ASSIGN_OR_RETURN(auto dot, create_sharded_dot(resharded_lhs.hlo(), - resharded_rhs.hlo(), &b_)); - SetPartitionedHlo(hlo, [&] { return dot; }); - return Status::OK(); - } - // Output is partitioned along LHS non-contracting dimensions. 
- if (output_lhs_non_contracting_partitions == num_partitions_) { - auto resharded_lhs = lhs.Reshard(*output_sharding_transposed_to_match_lhs); - auto replicated_rhs = rhs.Reshard(HloSharding::Replicate()); - TF_ASSIGN_OR_RETURN( - auto dot, - create_sharded_dot(resharded_lhs.hlo(), replicated_rhs.hlo(), &b_)); - SetPartitionedHlo(hlo, [&] { return dot; }); - return Status::OK(); - } - // Output is partitioned along RHS non-contracting dimensions. - if (output_rhs_non_contracting_partitions == num_partitions_) { - auto replicated_lhs = lhs.Reshard(HloSharding::Replicate()); - auto resharded_rhs = rhs.Reshard(*output_sharding_transposed_to_match_rhs); - TF_ASSIGN_OR_RETURN(auto dot, create_sharded_dot(replicated_lhs.hlo(), - resharded_rhs.hlo(), &b_)); - SetPartitionedHlo(hlo, [&] { return dot; }); - return Status::OK(); - } - - // Returns true if it is beneficial to reshard the operand at `operand_idx` - // across the contracting dimension. - const auto should_partition_contracting_dim = [&](int64 operand_idx) { - if (!hlo->sharding().IsReplicated()) { - return false; - } - - if (operand_idx == 0) { - // If LHS and output are replicated, we compare the cost of all-gather - // on RHS vs all-reduce on the output. - return (rhs_contracting_partitions == num_partitions_) && - lhs.sharding().IsReplicated() && - ShapeUtil::ElementsIn(hlo->operand(1)->shape()) > - ShapeUtil::ElementsIn(hlo->shape()); - } else { - return (lhs_contracting_partitions == num_partitions_) && - rhs.sharding().IsReplicated() && - ShapeUtil::ElementsIn(hlo->operand(0)->shape()) > - ShapeUtil::ElementsIn(hlo->shape()); - } - }; - - // When the output is replicated and one of the operands is partitioned along - // contracting dimension, align the other operand to be partitioned along - // the contracting dimensions. - if (hlo->sharding().IsReplicated() && (should_partition_contracting_dim(0) || - should_partition_contracting_dim(1))) { - auto zero = b_.AddInstruction(HloInstruction::CreateConstant( - LiteralUtil::Zero(hlo->shape().element_type()))); - if (should_partition_contracting_dim(0)) { - lhs = - lhs.Reshard(*rhs_sharding_transposed_to_match_lhs).PadWithValue(zero); - rhs = rhs.PadWithValue(zero); - } else { - lhs = lhs.PadWithValue(zero); - rhs = - rhs.Reshard(*lhs_sharding_transposed_to_match_rhs).PadWithValue(zero); - } - TF_ASSIGN_OR_RETURN(auto dot, - create_sharded_dot(lhs.hlo(), rhs.hlo(), &b_)); - SetPartitionedHlo(hlo, [&] { - auto ar = collective_ops_creator_.create_cross_partition_all_reduce( - &b_, dot, MakeBinaryAdd(hlo->shape().element_type(), module_), - NewChannel()); - ar->set_sharding(HloSharding::Replicate()); - return PartitionedHlo(ar, hlo->shape(), MakePartitioningState()).hlo(); - }); - return Status::OK(); - } - - return DefaultAction(hlo); -} - -namespace { - -// Finds a cluster of nodes that produce the inputs for `hlo` which only depend -// on small operands, which means the cluster should start with broadcasts, -// constants and iotas. All other internal nodes must be non-side-effecting -// elemntwise ops. Returns the set of nodes, and the small operands. E.g., for -// the following graph, -// -// a -> broadcast -> multiply -// iota ---> add--/ -// constant/ -// -// FindInputNodesIfOnlyDependOnSmallOperands(multiply) will return -// <{broadcast, iota, constant, add, multiply}, [a]>. 
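should_partition_contracting_dim above is a data-movement trade-off: with a replicated output, keeping the contracting dimension partitioned swaps an all-gather of the other operand for an all-reduce of the output, so it only pays off when that operand is larger than the output. A sketch of the element-count comparison with hypothetical sizes:

#include <cstdint>
#include <iostream>

// Illustrative cost check for a hypothetical einsum [M, K] x [K, N] -> [M, N]
// where the RHS is already partitioned along K and the output is replicated.
int main() {
  const int64_t m = 1024, k = 65536, n = 256;
  const int64_t rhs_elements = k * n;  // what an all-gather of the RHS moves
  const int64_t out_elements = m * n;  // what an all-reduce of the output moves
  const bool partition_contracting = rhs_elements > out_elements;
  std::cout << (partition_contracting
                    ? "keep K partitioned and all-reduce the output\n"
                    : "all-gather the smaller operand instead\n");
  return 0;
}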
-std::pair, std::vector> -FindInputNodesIfOnlyDependOnSmallOperands(HloInstruction* hlo) { - std::unordered_set nodes_found; - std::vector new_operands; - std::unordered_set new_operands_set; - std::vector worklist; - worklist.push_back(hlo); - while (!worklist.empty()) { - auto inst = worklist.back(); - worklist.pop_back(); - if (nodes_found.count(inst) > 0) { - continue; - } - if (inst->opcode() == HloOpcode::kBroadcast || - inst->opcode() == HloOpcode::kConstant || - inst->opcode() == HloOpcode::kIota) { - nodes_found.insert(inst); - for (auto o : inst->operands()) { - auto res = new_operands_set.emplace(o); - if (res.second) { - new_operands.push_back(o); - } - } - } else if (inst->IsElementwise() && !inst->HasSideEffectNoRecurse() && - inst->opcode() != HloOpcode::kAllReduce && - absl::c_all_of(inst->operands(), - [inst](const HloInstruction* o) { - return ShapeUtil::CompatibleIgnoringElementType( - o->shape(), inst->shape()); - })) { - nodes_found.insert(inst); - for (auto o : inst->operands()) { - worklist.push_back(o); - } - } else { - nodes_found.clear(); - new_operands.clear(); - break; - } - } - return {std::move(nodes_found), std::move(new_operands)}; -} - -// Moves a cluster of memory-reducing nodes into the windowed dot-general loop -// on contracting dimensions. Such a loop has a dynamic slice on the -// non-windowed operand. If we move the input nodes into the loop, the -// dynamic-slice could be merged with them by later optimization passes, which -// reduces memory. -// -// small_operands small_operands -// | | -// input_nodes loop { | -// | => input_nodes -// loop { | | -// dynamic-slice dynamic-slice -// ... ... -// } } -// -// Later optimization passes (TpuPadSliceMover) will merge the dynamic slice -// with the input nodes. -Status SinkInputNodesIntoWindowedDotGeneralLoopOnContractingDimensions( - HloInstruction* loop, int64 non_windowed_operand_index) { - auto input_tuple = loop->mutable_operand(0); - auto old_operand = input_tuple->mutable_operand(non_windowed_operand_index); - auto input_nodes = FindInputNodesIfOnlyDependOnSmallOperands(old_operand); - auto to_sink = std::move(input_nodes.first); - auto new_operands = std::move(input_nodes.second); - if (to_sink.empty()) { - return Status::OK(); - } - auto computation = loop->parent(); - // Replace the old operand with a tuple of the found small operands. - auto new_input_subtuple = - computation->AddInstruction(HloInstruction::CreateTuple(new_operands)); - TF_RETURN_IF_ERROR(input_tuple->ReplaceOperandWithDifferentShape( - non_windowed_operand_index, new_input_subtuple)); - - auto body = loop->while_body(); - auto body_param = body->parameter_instruction(0); - auto old_body_param_users = body_param->users(); - // Update all tuple shapes. - for (auto tuple : std::vector{ - input_tuple, loop, loop->while_condition()->parameter_instruction(0), - body_param, body->root_instruction()}) { - *ShapeUtil::GetMutableSubshape(tuple->mutable_shape(), - {non_windowed_operand_index}) = - new_input_subtuple->shape(); - } - // Now update the loop body. - auto new_operand_tuple_inside = - body->AddInstruction(HloInstruction::CreateGetTupleElement( - new_input_subtuple->shape(), body_param, non_windowed_operand_index)); - TF_RETURN_IF_ERROR(body->root_instruction()->ReplaceOperandWithDifferentShape( - non_windowed_operand_index, new_operand_tuple_inside)); - - // Create nodes inside the loop body. 
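FindInputNodesIfOnlyDependOnSmallOperands is a worklist search that succeeds only if every path from the root bottoms out in a broadcast, constant, or iota, passing through nothing but non-side-effecting elementwise ops. A toy version over a hypothetical node type (not HloInstruction), with add/multiply standing in for the elementwise check:

#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>

// Toy operand-graph node; the string opcode is purely illustrative.
struct Node {
  std::string op;  // "broadcast", "constant", "iota", "add", "multiply", ...
  std::vector<Node*> operands;
};

// Returns false (and clears the outputs) if any node on the way is neither a
// small-producing leaf nor an elementwise op, mirroring the abort above.
bool Find(Node* root, std::unordered_set<Node*>* found,
          std::vector<Node*>* small_operands) {
  std::vector<Node*> worklist = {root};
  std::unordered_set<Node*> seen_operands;
  while (!worklist.empty()) {
    Node* n = worklist.back();
    worklist.pop_back();
    if (found->count(n)) continue;
    if (n->op == "broadcast" || n->op == "constant" || n->op == "iota") {
      found->insert(n);
      for (Node* o : n->operands) {
        if (seen_operands.insert(o).second) small_operands->push_back(o);
      }
    } else if (n->op == "add" || n->op == "multiply") {  // "elementwise" stand-in
      found->insert(n);
      for (Node* o : n->operands) worklist.push_back(o);
    } else {
      found->clear();
      small_operands->clear();
      return false;
    }
  }
  return true;
}

int main() {
  Node a{"parameter", {}};
  Node bcast{"broadcast", {&a}};
  Node iota{"iota", {}};
  Node add{"add", {&bcast, &iota}};
  std::unordered_set<Node*> found;
  std::vector<Node*> small;
  std::cout << (Find(&add, &found, &small) ? "sinkable" : "not sinkable")
            << ", nodes=" << found.size()
            << ", small operands=" << small.size() << "\n";
  return 0;
}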
- std::vector worklist; - std::unordered_map outside_to_inside; - auto add_users_if_available = [&](HloInstruction* inst) { - for (auto u : inst->users()) { - if (outside_to_inside.count(u) == 0 && to_sink.count(u) > 0 && - absl::c_all_of(u->operands(), [&](const HloInstruction* o) { - return outside_to_inside.count(o) > 0; - })) { - worklist.push_back(u); - } - } - }; - for (int64 i = 0; i < new_operands.size(); ++i) { - outside_to_inside[new_operands[i]] = - body->AddInstruction(HloInstruction::CreateGetTupleElement( - new_operands[i]->shape(), new_operand_tuple_inside, i)); - add_users_if_available(new_operands[i]); - } - // HLOs to sink without operands. - std::vector nullaries_to_sink; - for (auto inst : to_sink) { - if (inst->operand_count() == 0) { - nullaries_to_sink.push_back(inst); - } - } - // Sort nullaries_to_sink to make it deterministic. - absl::c_sort(nullaries_to_sink, - [](const HloInstruction* a, const HloInstruction* b) { - return a->unique_id() < b->unique_id(); - }); - for (auto inst : nullaries_to_sink) { - worklist.push_back(inst); - } - while (!worklist.empty()) { - auto inst = worklist.back(); - worklist.pop_back(); - std::vector inst_new_operands(inst->operand_count()); - for (int64 i = 0; i < inst->operand_count(); ++i) { - inst_new_operands[i] = outside_to_inside[inst->operand(i)]; - } - outside_to_inside[inst] = body->AddInstruction( - inst->CloneWithNewOperands(inst->shape(), inst_new_operands)); - add_users_if_available(inst); - } - TF_RET_CHECK(outside_to_inside.count(old_operand) > 0); - for (auto ou : old_body_param_users) { - if (ou->opcode() == HloOpcode::kGetTupleElement && - ou->tuple_index() == non_windowed_operand_index) { - TF_RETURN_IF_ERROR( - ou->ReplaceAllUsesWith(outside_to_inside[old_operand])); - TF_RETURN_IF_ERROR(body->RemoveInstruction(ou)); - } - } - return Status::OK(); -} - -// Moves a cluster of memory-reducing nodes (with reduce nodes at the end) into -// the windowed dot-general loop on non-contracting dimensions. Such a loop has -// a dynamic-update-slice at the output. If we move the user nodes into the loop -// and before the dynamic-update-slice, the user nodes can operate on smaller -// shapes, which reduces memory. -// -// small_operands small_operands -// | | => | | -// | | loop { loop { | | -// | | conv | broadcast conv -// | | | | | / -// | | dynamic-update-slice | dynamic-slice / -// | | | | | / -// | | } | | multiply----- -// |broadcast / | / -// | | / reduce -// |multiply-- | -// \ | dynamic-update-slice -// reduce } -// -// Later optimization passes (TpuPadSliceMover) will merge the dynamic slice -// with the input nodes (broadcast). -Status MoveUsersIntoWindowedDotGeneralLoopOnNonContractingDimensions( - HloInstruction* loop) { - CHECK_EQ(loop->user_count(), 1); - // There should be a single direct user of the while loop, which is the - // gte for element 2, i.e., the dot output. - auto user_gte = loop->users().front(); - CHECK_EQ(user_gte->opcode(), HloOpcode::kGetTupleElement); - CHECK_EQ(user_gte->tuple_index(), 2); - auto computation = loop->parent(); - - // Find the reduce outputs and the input nodes they depend on, if input nodes - // only have small operands. 
- std::unordered_set to_move; - std::vector new_operands; - std::unordered_set new_operands_set; - std::vector reduce_outputs; - std::vector worklist; - Shape padded_shape = user_gte->shape(); - Shape unpadded_shape = user_gte->shape(); - auto original_output = user_gte; - - if (user_gte->user_count() == 1 && - user_gte->users().back()->opcode() == HloOpcode::kSlice) { - original_output = user_gte->users().back(); - unpadded_shape = original_output->shape(); - } - for (auto u : original_output->users()) { - worklist.push_back(u); - } - to_move.insert(original_output); - while (!worklist.empty()) { - auto inst = worklist.back(); - worklist.pop_back(); - if (to_move.count(inst) > 0) { - continue; - } - // We only support reduces with simple reduction function, since we may need - // to accumulate across iterations manually. - if (inst->opcode() == HloOpcode::kReduce && - inst->to_apply()->instruction_count() == 3 && - inst->to_apply()->num_parameters() == 2 && - inst->to_apply()->root_instruction()->IsElementwise()) { - to_move.insert(inst); - auto other_operand = inst->mutable_operand(1); - auto res = new_operands_set.emplace(other_operand); - if (res.second) { - new_operands.push_back(other_operand); - } - reduce_outputs.push_back(inst); - } else if (inst != computation->root_instruction() && - inst->user_count() > 0 && inst->IsElementwise() && - !inst->HasSideEffectNoRecurse() && - inst->opcode() != HloOpcode::kAllReduce && - absl::c_all_of(inst->operands(), - [inst](const HloInstruction* o) { - return ShapeUtil::CompatibleIgnoringElementType( - o->shape(), inst->shape()); - })) { - // For an elementwise op, we need to make sure that they depend on only - // nodes already in to_move and nodes with small operands. - bool can_include = true; - for (auto operand : inst->operands()) { - if (to_move.count(operand) > 0) { - continue; - } - auto find_result = FindInputNodesIfOnlyDependOnSmallOperands(operand); - if (find_result.first.empty()) { - can_include = false; - break; - } - for (auto n : find_result.first) { - to_move.insert(n); - } - for (auto new_operand : find_result.second) { - auto res = new_operands_set.insert(new_operand); - if (res.second) { - new_operands.push_back(new_operand); - } - } - } - if (!can_include) { - to_move.clear(); - break; - } - to_move.insert(inst); - for (auto u : inst->users()) { - worklist.push_back(u); - } - } else { - to_move.clear(); - break; - } - } - // If nothing is found, to_move could contain only original_output, or cleared - // by the above code. - if (to_move.size() <= 1) { - return Status::OK(); - } - - // We will replace the original loop output with reduce-shape outputs. Create - // the initial buffers before the loop. - for (auto out : reduce_outputs) { - auto padded_out_shape = out->shape(); - int64 operand_dim = 0; - int64 output_dim = 0; - while (output_dim < padded_out_shape.rank()) { - if (absl::c_linear_search(out->dimensions(), operand_dim)) { - // Dimension colapsed. - ++operand_dim; - continue; - } - // Kept dimensions have the same size of the padded shape. 
- padded_out_shape.set_dimensions(output_dim, - padded_shape.dimensions(operand_dim)); - ++operand_dim; - ++output_dim; - } - auto broadcast = - computation->AddInstruction(HloInstruction::CreateBroadcast( - padded_out_shape, - computation->AddInstruction(HloInstruction::CreateConstant( - LiteralUtil::Zero(out->shape().element_type()))), - {})); - new_operands.push_back(broadcast); - } - - auto input_tuple = loop->mutable_operand(0); - // Create the new input subtuple that contains the small operands and the - // reduce-shape result buffers. - auto new_input_subtuple = - computation->AddInstruction(HloInstruction::CreateTuple(new_operands)); - TF_RETURN_IF_ERROR( - input_tuple->ReplaceOperandWithDifferentShape(2, new_input_subtuple)); - auto body = loop->while_body(); - auto body_param = body->parameter_instruction(0); - auto body_root = body->root_instruction(); - CHECK_EQ(body_root->opcode(), HloOpcode::kTuple); - // Update tuple shapes. - for (auto tuple : std::vector{ - input_tuple, loop, loop->while_condition()->parameter_instruction(0), - body_param, body_root}) { - *ShapeUtil::GetMutableSubshape(tuple->mutable_shape(), {2}) = - new_input_subtuple->shape(); - } - auto new_loop_input = - body->AddInstruction(HloInstruction::CreateGetTupleElement( - new_input_subtuple->shape(), body_param, 2)); - - // Now create the moved nodes inside the loop body. - std::unordered_map outside_to_inside; - worklist.clear(); - auto add_users_if_available = [&](HloInstruction* inst) { - for (auto u : inst->users()) { - if (outside_to_inside.count(u) == 0 && to_move.count(u) > 0 && - absl::c_all_of(u->operands(), [&](const HloInstruction* o) { - return outside_to_inside.count(o) > 0; - })) { - worklist.push_back(u); - } - } - }; - for (int64 i = 0; i < new_operands.size(); ++i) { - outside_to_inside[new_operands[i]] = - body->AddInstruction(HloInstruction::CreateGetTupleElement( - new_operands[i]->shape(), new_loop_input, i)); - add_users_if_available(new_operands[i]); - } - // The elementwise nodes will be created with sliced shape. The original loop - // output corresponds to the dynamic-update-slice's update slice. - auto dus = body_root->mutable_operand(2); - CHECK_EQ(dus->opcode(), HloOpcode::kDynamicUpdateSlice); - outside_to_inside[original_output] = dus->mutable_operand(1); - add_users_if_available(original_output); - std::vector slice_offsets(padded_shape.rank()); - for (int64 i = 0; i < slice_offsets.size(); ++i) { - slice_offsets[i] = dus->mutable_operand(i + 2); - } - auto get_slice = [&](HloInstruction* padded) { - return body->AddInstruction(HloInstruction::CreateDynamicSlice( - ShapeUtil::ChangeElementType(dus->operand(1)->shape(), - padded->shape().element_type()), - padded, slice_offsets, dus->operand(1)->shape().dimensions())); - }; - // Helper functions to create nodes with small operands. 
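The slice offsets gathered above come straight from the dynamic-update-slice's operand list, whose layout is {destination, update, offset_0, ..., offset_{rank-1}}. A tiny sketch of that indexing, using a hypothetical FakeHlo in place of the real HloInstruction API:

#include <cstdint>
#include <vector>

struct FakeHlo {
  std::vector<FakeHlo*> operands;
};

// The offset for dimension i lives at operand index i + 2, after the
// destination (0) and the update slice (1).
std::vector<FakeHlo*> SliceOffsetsFromDynamicUpdateSlice(FakeHlo* dus,
                                                         int64_t rank) {
  std::vector<FakeHlo*> offsets(rank);
  for (int64_t i = 0; i < rank; ++i) {
    offsets[i] = dus->operands[i + 2];
  }
  return offsets;
}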
- auto add_broadcast = [&](const HloInstruction* broadcast) { - auto padded_operand_shape = broadcast->operand(0)->shape(); - for (int64 i = 0; i < broadcast->dimensions().size(); ++i) { - padded_operand_shape.set_dimensions( - i, padded_shape.dimensions(broadcast->dimensions(i))); - } - auto padded_operand = PadToShape(outside_to_inside[broadcast->operand(0)], - padded_operand_shape, nullptr, body); - outside_to_inside[broadcast] = - get_slice(body->AddInstruction(broadcast->CloneWithNewOperands( - ShapeUtil::ChangeElementType(padded_shape, - padded_operand_shape.element_type()), - {padded_operand}))); - }; - auto add_iota = [&](const HloInstruction* iota) { - outside_to_inside[iota] = - get_slice(body->AddInstruction(iota->CloneWithNewOperands( - ShapeUtil::ChangeElementType(padded_shape, - iota->shape().element_type()), - {}))); - }; - auto add_constant = [&](const HloInstruction* constant) { - outside_to_inside[constant] = body->AddInstruction(constant->Clone()); - outside_to_inside[constant] = get_slice( - PadToShape(outside_to_inside[constant], - ShapeUtil::ChangeElementType( - padded_shape, constant->shape().element_type()), - nullptr, body)); - }; - while (!worklist.empty()) { - auto inst = worklist.back(); - worklist.pop_back(); - if (outside_to_inside.count(inst) > 0) { - continue; - } - if (inst->opcode() == HloOpcode::kBroadcast) { - add_broadcast(inst); - } else if (inst->opcode() == HloOpcode::kIota) { - add_iota(inst); - } else if (inst->opcode() == HloOpcode::kConstant) { - add_constant(inst); - } else if (inst->opcode() == HloOpcode::kReduce) { - // This is an output, for which we has special handling later. - } else { - std::vector operands_inside(inst->operand_count()); - for (int64 i = 0; i < operands_inside.size(); ++i) { - operands_inside[i] = outside_to_inside[inst->operand(i)]; - } - outside_to_inside[inst] = body->AddInstruction(inst->CloneWithNewOperands( - ShapeUtil::ChangeElementType(dus->operand(1)->shape(), - inst->shape().element_type()), - operands_inside)); - } - add_users_if_available(inst); - } - std::vector new_outputs_inside(new_operands.size()); - for (int64 i = 0; i < new_outputs_inside.size(); ++i) { - new_outputs_inside[i] = outside_to_inside[new_operands[i]]; - } - // Now create the reduce outpus inside of the loop. - for (int64 i = 0; i < reduce_outputs.size(); ++i) { - auto reduce_outside = reduce_outputs[i]; - CHECK_EQ(reduce_outside->opcode(), HloOpcode::kReduce); - int64 index_in_operand = new_operands.size() - reduce_outputs.size() + i; - auto last_iter_result = outside_to_inside[new_operands[index_in_operand]]; - auto operand0 = outside_to_inside[reduce_outside->operand(0)]; - auto operand1 = outside_to_inside[reduce_outside->operand(1)]; - TF_ASSIGN_OR_RETURN(auto reduce_shape, - ShapeInference::InferReduceShape( - {&operand0->shape(), &operand1->shape()}, - reduce_outside->dimensions(), - reduce_outside->to_apply()->ComputeProgramShape())); - *reduce_shape.mutable_layout() = reduce_outside->shape().layout(); - std::vector reduce_dus_offsets; - // If any collapsed dimension is windowed, we need to accumulate with last - // iteration's result. If such a dimension has padding, we also need to mask - // off invalid data. 
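The masking step described above can be pictured with plain arithmetic: any element whose global index (local index plus the shard's slice offset) falls past the valid size is replaced by the reduction's identity value before accumulating. A standalone sketch of that idea, not the HLO select/compare implementation:

#include <cstdint>
#include <vector>

void MaskInvalidTail(std::vector<float>& shard, int64_t slice_offset,
                     int64_t valid_size, float identity) {
  for (int64_t i = 0; i < static_cast<int64_t>(shard.size()); ++i) {
    // Mirrors select(compare(iota + offset < limit), data, identity).
    if (i + slice_offset >= valid_size) shard[i] = identity;
  }
}
// Example: a shard of 4 elements at offset 6 with valid_size 8 masks the last
// 2 elements.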
- bool needs_accumulate = false; - std::vector dims_to_mask; - for (int64 i = 0; i < slice_offsets.size(); ++i) { - if (absl::c_linear_search(reduce_outside->dimensions(), i)) { - if (reduce_outside->operand(0)->shape().dimensions(i) != - operand0->shape().dimensions(i)) { - needs_accumulate = true; - if (unpadded_shape.dimensions(i) != padded_shape.dimensions(i)) { - dims_to_mask.push_back(i); - } - } - continue; - } - reduce_dus_offsets.push_back(slice_offsets[i]); - } - // Mask off invalid data in collapsed dimensions. - for (int64 dim : dims_to_mask) { - auto iota = body->AddInstruction(HloInstruction::CreateIota( - ShapeUtil::ChangeElementType(operand0->shape(), S32), dim)); - auto add = body->AddInstruction(HloInstruction::CreateBinary( - iota->shape(), HloOpcode::kAdd, iota, - body->AddInstruction(HloInstruction::CreateBroadcast( - iota->shape(), slice_offsets[dim], {})))); - auto limit = body->AddInstruction(HloInstruction::CreateBroadcast( - iota->shape(), - body->AddInstruction( - HloInstruction::CreateConstant(LiteralUtil::CreateR0( - reduce_outside->operand(0)->shape().dimensions(dim)))), - {})); - auto compare = body->AddInstruction(HloInstruction::CreateCompare( - ShapeUtil::ChangeElementType(iota->shape(), PRED), add, limit, - ComparisonDirection::kLt)); - operand0 = body->AddInstruction(HloInstruction::CreateTernary( - operand0->shape(), HloOpcode::kSelect, compare, operand0, - body->AddInstruction(HloInstruction::CreateBroadcast( - operand0->shape(), operand1, {})))); - } - auto output_inside = - body->AddInstruction(reduce_outside->CloneWithNewOperands( - reduce_shape, {operand0, operand1})); - // Accumulate with previous results if needed. - if (needs_accumulate) { - auto input_slice = - body->AddInstruction(HloInstruction::CreateDynamicSlice( - output_inside->shape(), last_iter_result, reduce_dus_offsets, - output_inside->shape().dimensions())); - output_inside = body->AddInstruction(HloInstruction::CreateBinary( - output_inside->shape(), - reduce_outside->to_apply()->root_instruction()->opcode(), - output_inside, input_slice)); - } - // Dynamic-update-slice if needed. - if (!ShapeUtil::Compatible(output_inside->shape(), - last_iter_result->shape())) { - output_inside = - body->AddInstruction(HloInstruction::CreateDynamicUpdateSlice( - last_iter_result->shape(), last_iter_result, output_inside, - reduce_dus_offsets)); - } - new_outputs_inside[index_in_operand] = output_inside; - } - // Body output. - auto new_output_inside = - body->AddInstruction(HloInstruction::CreateTuple(new_outputs_inside)); - TF_RETURN_IF_ERROR( - body_root->ReplaceOperandWithDifferentShape(2, new_output_inside)); - TF_RETURN_IF_ERROR(body->RemoveInstructionAndUnusedOperands(dus)); - // Replace uses of the reduces outside the loop. 
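The accumulate-across-iterations behavior above reduces each iteration's slice and combines it with the previous iteration's buffer using the reduce computation's root operation. A scalar sketch with addition as the combiner, illustrative only and assuming a simple add reduction:

#include <vector>

float ReduceShard(const std::vector<float>& shard) {
  float partial = 0.0f;
  for (float v : shard) partial += v;
  return partial;
}

float AccumulateOverIterations(const std::vector<std::vector<float>>& shards) {
  float acc = 0.0f;                  // the zero-initialized buffer created before the loop
  for (const auto& shard : shards) {
    acc = acc + ReduceShard(shard);  // binary op with last iteration's result
  }
  return acc;
}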
- auto new_output_gte = - computation->AddInstruction(HloInstruction::CreateGetTupleElement( - new_output_inside->shape(), loop, 2)); - for (int64 i = 0; i < reduce_outputs.size(); ++i) { - int64 index_in_operand = new_operands.size() - reduce_outputs.size() + i; - auto new_output = - computation->AddInstruction(HloInstruction::CreateGetTupleElement( - new_outputs_inside[index_in_operand]->shape(), new_output_gte, - index_in_operand)); - if (!ShapeUtil::Compatible(new_output->shape(), - reduce_outputs[i]->shape())) { - new_output = computation->AddInstruction(HloInstruction::CreateSlice( - reduce_outputs[i]->shape(), new_output, - std::vector(new_output->shape().rank(), 0), - reduce_outputs[i]->shape().dimensions(), - std::vector(new_output->shape().rank(), 1))); - } - TF_RETURN_IF_ERROR(reduce_outputs[i]->ReplaceAllUsesWith(new_output)); - TF_RETURN_IF_ERROR( - computation->RemoveInstructionAndUnusedOperands(reduce_outputs[i])); - } - return Status::OK(); -} - -} // namespace - -Status SpmdPartitioningVisitor::DoCodeMotionForWindowedDotGeneralLoops( - HloComputation* computation) { - for (auto& loop : windowed_dot_general_loops_) { - if (loop.windowed_in_contracting_dims || loop.windowed_in_batch_dims) { - // We have a dynamic-slice for the non-windowed operand in - // batch/contracting-dim windowed dot-general. So moving the - // broadcast/iota/elementwise ops into the loop could help reduce memory - // via fusion. - TF_RETURN_IF_ERROR( - SinkInputNodesIntoWindowedDotGeneralLoopOnContractingDimensions( - loop.while_loop, 1 - loop.windowed_operand)); - } - if (!loop.windowed_in_contracting_dims) { - // We have a dynamic-update-slice for the output in - // batch/non-contracting-dim windowed dot-general. So moving reduce ops - // into the loop could help reduce memory. - TF_RETURN_IF_ERROR( - MoveUsersIntoWindowedDotGeneralLoopOnNonContractingDimensions( - loop.while_loop)); - } - } - return Status::OK(); -} - StatusOr SpmdPartitioningVisitor::DoPartition( HloComputation* computation, const HloSharding& root_sharding) { VLOG(2) << "Partitioning computation " << computation->name() << " for " @@ -4648,13 +3048,36 @@ SPMDCollectiveOpsCreator GetDefaultCollectiveOpsCreator(int64 num_partitions, [](SpmdBuilder* b) { return b->AddInstruction(HloInstruction::CreatePartitionId()); }, - [num_replicas](SpmdBuilder* b, HloInstruction* operand, - HloComputation* reduction, int64 channel_id) { + [num_replicas, num_partitions]( + SpmdBuilder* b, HloInstruction* operand, HloComputation* reduction, + const std::vector>& partition_subgroups, + int64 channel_id) { + if (partition_subgroups.size() <= 1) { + std::vector groups(num_replicas); + // TODO(yuanzx): Unify subgroup definition with AllToAll. 
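The subgrouped all-reduce added above flattens (replica, partition) pairs into global device ids as replica * num_partitions + partition, producing one group per (replica, partition subgroup) pair. A standalone sketch of that group construction, with plain vectors in place of ReplicaGroup:

#include <cstdint>
#include <vector>

std::vector<std::vector<int64_t>> DeviceGroups(
    int64_t num_replicas, int64_t num_partitions,
    const std::vector<std::vector<int64_t>>& partition_subgroups) {
  std::vector<std::vector<int64_t>> groups;
  groups.reserve(partition_subgroups.size() * num_replicas);
  for (int64_t r = 0; r < num_replicas; ++r) {
    for (const auto& pgroup : partition_subgroups) {
      groups.emplace_back();
      for (int64_t pid : pgroup) {
        groups.back().push_back(r * num_partitions + pid);
      }
    }
  }
  return groups;
}
// With 2 replicas, 4 partitions and subgroups {{0,1},{2,3}}, this yields
// {{0,1},{2,3},{4,5},{6,7}}, consistent with use_global_device_ids=true.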
+ for (int64 i = 0; i < num_replicas; ++i) { + groups[i].add_replica_ids(i); + } + return b->AddInstruction(HloInstruction::CreateAllReduce( + operand->shape(), {operand}, reduction, groups, + /*constrain_layout=*/false, channel_id, + /*use_global_device_ids=*/false)); + } + + std::vector device_groups; + device_groups.reserve(partition_subgroups.size() * num_replicas); + for (int64 i = 0; i < num_replicas; ++i) { + for (const auto& pgroup : partition_subgroups) { + device_groups.emplace_back(); + for (int64 pid : pgroup) { + device_groups.back().add_replica_ids(i * num_partitions + pid); + } + } + } return b->AddInstruction(HloInstruction::CreateAllReduce( - operand->shape(), {operand}, reduction, - CreateReplicaGroups(num_replicas), + operand->shape(), {operand}, reduction, device_groups, /*constrain_layout=*/false, channel_id, - /*use_global_device_ids=*/false)); + /*use_global_device_ids=*/true)); }, [](SpmdBuilder* b, HloInstruction* operand, std::vector>& src_dst_pairs, @@ -4663,14 +3086,20 @@ SPMDCollectiveOpsCreator GetDefaultCollectiveOpsCreator(int64 num_partitions, operand->shape(), operand, src_dst_pairs, channel_id)); }, [](SpmdBuilder* b, absl::Span operands, - const std::vector& replica_groups, int64 channel_id, - absl::optional split_dimension) { + const std::vector>& partition_subgroups, + int64 channel_id, absl::optional split_dimension) { std::vector shapes(operands.size(), operands[0]->shape()); const Shape output_shape = (shapes.size() == 1) ? shapes[0] : ShapeUtil::MakeTupleShape(shapes); + std::vector groups(partition_subgroups.size()); + for (int64 i = 0; i < groups.size(); ++i) { + for (int64 id : partition_subgroups[i]) { + groups[i].add_replica_ids(id); + } + } return b->AddInstruction(HloInstruction::CreateAllToAll( - output_shape, operands, replica_groups, + output_shape, operands, groups, /*constrain_layout=*/false, channel_id, split_dimension)); }, [num_replicas, num_partitions]( @@ -4701,10 +3130,10 @@ SpmdPartitioner::SpmdPartitioner(int64 num_partitions, int64 num_replicas, num_partitions, num_replicas, std::move(options), GetDefaultCollectiveOpsCreator(num_partitions, num_replicas)) {} -HloInstruction* SpmdPartitioner::AllGatherShards(SpmdBuilder* b, - HloInstruction* operand, - const HloSharding& sharding, - int64 channel_id) { +HloInstruction* SpmdPartitioner::AllGatherShards( + SpmdBuilder* b, HloInstruction* operand, const HloSharding& sharding, + int64 channel_id, absl::Span selected_dims, + const SPMDCollectiveOpsCreator& collectives_creator) { CHECK(!sharding.IsTileMaximal()); // Add one leading dimension to gather all partitions. std::vector shape; @@ -4714,18 +3143,17 @@ HloInstruction* SpmdPartitioner::AllGatherShards(SpmdBuilder* b, } auto reshape = b->AddInstruction(HloInstruction::CreateReshape( ShapeUtil::MakeShape(operand->shape().element_type(), shape), operand)); - std::vector> partition_subgroups(1); - for (int64 pid : sharding.tile_assignment()) { - partition_subgroups[0].push_back(pid); - } - shape[0] = sharding.tile_assignment().num_elements(); - auto result = collective_ops_creator_.create_cross_partition_all_gather( + auto partition_subgroups = + GetPartitionGroupsForReplication(sharding, selected_dims); + shape[0] = partition_subgroups[0].size(); + auto result = collectives_creator.create_cross_partition_all_gather( b, reshape, ShapeUtil::MakeShape(operand->shape().element_type(), shape), partition_subgroups, channel_id, /*all_gather_dimension=*/0); // If n > 1 dimensions are partitioned, split the leading dimension to n. 
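AllGatherShards above prepends a size-1 dimension, all-gathers along it over the selected partition groups, splits the gathered dimension into one component per selected tiled dimension, transposes each component next to its data dimension, and reshapes to the combined shape. The net effect on shapes can be summarized with a small helper; this is illustrative shape arithmetic only, not the partitioner API:

#include <cstdint>
#include <vector>

// Each gathered dimension grows by its tile count; unselected dimensions keep
// their shard size.
std::vector<int64_t> AllGatheredShape(const std::vector<int64_t>& shard_dims,
                                      const std::vector<int64_t>& tiles_per_dim,
                                      const std::vector<int64_t>& selected_dims) {
  std::vector<int64_t> out = shard_dims;
  for (int64_t d : selected_dims) {
    out[d] = shard_dims[d] * tiles_per_dim[d];
  }
  return out;
}
// A [3,5] shard tiled 2x4 and gathered on both dims becomes [6,20]; gathering
// only on dim 1 would give [3,20].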
std::vector tiled_dims; for (int64 i = 0; i < sharding.tile_assignment().num_dimensions(); ++i) { - if (sharding.tile_assignment().dim(i) > 1) { + if (sharding.tile_assignment().dim(i) > 1 && + absl::c_linear_search(selected_dims, i)) { tiled_dims.push_back(i); } } @@ -4747,7 +3175,8 @@ HloInstruction* SpmdPartitioner::AllGatherShards(SpmdBuilder* b, std::vector xpose_permutation(result->shape().rank()); int64 split_dims_added = 0; for (int64 i = 0; i < xpose_permutation.size(); ++i) { - if (sharding.tile_assignment().dim(i - split_dims_added) == 1) { + if (sharding.tile_assignment().dim(i - split_dims_added) == 1 || + !absl::c_linear_search(selected_dims, i - split_dims_added)) { xpose_permutation[i] = i + tiled_dims.size() - split_dims_added; } else { xpose_permutation[i] = split_dims_added; diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h index 52e4c9021d8..a612c16bdae 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h @@ -20,6 +20,7 @@ limitations under the License. #include #include +#include "absl/container/flat_hash_map.h" #include "absl/types/optional.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" @@ -82,8 +83,10 @@ struct SPMDCollectiveOpsCreator { std::function create_partition_id; // Function used to create a cross-partition all-reduce HLO. - std::function + std::function>& partition_subgroups, + int64 channel_id)> create_cross_partition_all_reduce; // Function used to create a cross-partition collective-permute HLO. @@ -96,8 +99,8 @@ struct SPMDCollectiveOpsCreator { // Function used to create a cross-partition all-to-all HLO. std::function operands, - const std::vector& replica_groups, int64 channel_id, - absl::optional split_dimension)> + const std::vector>& partition_subgroups, + int64 channel_id, absl::optional split_dimension)> create_cross_partition_all_to_all; // Function used to create a cross-partition all-gather HLO. This is optional: @@ -169,10 +172,13 @@ class SpmdPartitioner : public HloModulePass { // The default uses a single all-gather even if there are multiple sharded // dimensions, and adds potential reshapes and transposes to achieve that. // If it returns false, the partitioner will fall back to all-reduce. - virtual HloInstruction* AllGatherShards(SpmdBuilder* b, - HloInstruction* operand, - const HloSharding& sharding, - int64 channel_id); + // `selected_dims` specifies the dimensions along which the all-gather happens + // in the tiled sharding, which allows potentially creating a subgroup + // all-gather. + virtual HloInstruction* AllGatherShards( + SpmdBuilder* b, HloInstruction* operand, const HloSharding& sharding, + int64 channel_id, absl::Span selected_dims, + const SPMDCollectiveOpsCreator& collectives_creator); protected: virtual std::unique_ptr CreateVisitor( @@ -215,7 +221,12 @@ class PartitionedHlo { std::tuple> window_reshard_cache; }; + // Use std::unordered_map for pointer stability. std::unordered_map per_hlo_cache; + // Caches for nested partitioning of grouped sharding. Each string key + // represents a unique way of grouping devices. + absl::flat_hash_map> + groupd_caches; }; struct PartitioningState { SpmdBuilder* b; @@ -270,21 +281,26 @@ class PartitionedHlo { const PartitioningState& state() const { return state_; } + // Helper function to replicate the data on all devices. 
Could only modify + // the reshard cache. + PartitionedHlo Replicate(); + + // Helper function to replicate the data for partitions along the given dims. + HloInstruction* ReplicatePartial(absl::Span dims); + private: // Same as Reshard except that it does not explicitly modify the reshard // cache, although it would indirectly modify by calling Replicate(). PartitionedHlo ReshardNoCache(const HloSharding& target); - // Helper function to replicate the data on all devices. Could only modify - // the reshard cache. - PartitionedHlo Replicate(); - // Helper function to broadcast data from a single device to all devices. PartitionedHlo Broadcast() const; // Helper function to reshard the tensor using AllToAll (instead of the // default of Replicate followed by Slice). - PartitionedHlo ReshardWithAllToAll(const HloSharding& target) const; + PartitionedHlo ReshardWithAllToAll( + const HloSharding& target, + absl::Span> source_target_dims) const; // Helper function to reshard the tensor using CollectivePermute. PartitionedHlo ReshardWithCollectivePermute(const HloSharding& target) const; @@ -314,6 +330,22 @@ struct DotGeneralDimsMapping { std::vector rhs_non_contracting_dims; }; +struct ConvolutionDimsMapping { + // The dimension numbers for the operands and output corresponding to a + // logical dimension (e.g., batch, parallel, non-parallel). If an + // operand or the output doesn't have the logical dimension, it is set to + // -1. + struct DimsMapping { + int64 lhs; + int64 rhs; + int64 output; + // input mapped to index in input_spatial_dimensions(). + int64 spatial; + }; + std::vector parallel_spatial_dims; + std::vector non_parallel_spatial_dims; +}; + class SpmdPartitioningVisitor : public DfsHloVisitorWithDefault { public: SpmdPartitioningVisitor( @@ -354,9 +386,6 @@ class SpmdPartitioningVisitor : public DfsHloVisitorWithDefault { Status HandleIota(HloInstruction* hlo) override; Status HandlePartitionId(HloInstruction* hlo) override; - // Handles convolution where both LHS and RHS operands are tiled. - Status HandleConvolutionTiledLhsAndRhs(HloInstruction* hlo); - // Implementation of dot partitioning given DotGeneralDimsMapping. Status HandleDotHelper( HloInstruction* hlo, const DotGeneralDimsMapping& dims_mapping, @@ -415,6 +444,16 @@ class SpmdPartitioningVisitor : public DfsHloVisitorWithDefault { StatusOr DoPartition(HloComputation* computation, const HloSharding& root_sharding); + // Information about a loop created for windowed dot-general. Used when + // DoCodeMotionForWindowedDotGeneralLoops() executes after the visitor + // finishes traversing the graph. + struct WindowedDotGeneralLoop { + HloInstruction* while_loop; + int64 windowed_operand; + bool windowed_in_contracting_dims; + bool windowed_in_batch_dims; + }; + private: Status Preprocess(HloInstruction* hlo) override; Status Postprocess(HloInstruction* hlo) override; @@ -443,15 +482,6 @@ class SpmdPartitioningVisitor : public DfsHloVisitorWithDefault { // partitioned instruction. ConstHloInstructionMap partitioned_instructions_; - // Information about a loop created for windowed dot-general. Used when - // DoCodeMotionForWindowedDotGeneralLoops() executes after the visitor - // finishes traversing the graph. 
- struct WindowedDotGeneralLoop { - HloInstruction* while_loop; - int64 windowed_operand; - bool windowed_in_contracting_dims; - bool windowed_in_batch_dims; - }; std::vector windowed_dot_general_loops_; HloInstruction* visiting_hlo_; diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc index 1f0b1d06c1f..3ffe2954d61 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc @@ -380,6 +380,43 @@ ENTRY entry { op::GetTupleElement(second_infeed)))); } +TEST_F(SpmdPartitioningTest, MixedTupleInfeed) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + token0 = token[] after-all(), sharding={maximal device=0} + infeed = ((f32[9,2]{1,0}, f32[2]{0}), token[]) infeed(token0), + sharding={{maximal device=0}, {maximal device=1}, {maximal device=0}} + ROOT infeed.data = (f32[9,2]{1,0}, f32[2]{0}) get-tuple-element(infeed), + index=0, sharding={{maximal device=0}, {maximal device=1}} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("(f32[9,2], f32[2])"), + op::GetTupleElement(op::Conditional( + op::Convert(op::PartitionId()), op::AfterAll(), + op::AfterAll())))); + auto first_infeed = AllOf(op::Shape("((f32[9,2], ()), token[])"), + op::Infeed(op::Parameter())); + EXPECT_THAT(root->operand(0)->called_computations()[0]->root_instruction(), + AllOf(op::Shape("((f32[9,2], f32[2]), token[])"), + op::Tuple(op::Tuple(op::GetTupleElement( + op::GetTupleElement(first_infeed)), + op::Broadcast(op::Constant())), + op::GetTupleElement(first_infeed)))); + auto second_infeed = + AllOf(op::Shape("(((), f32[2]), token[])"), op::Infeed(op::Parameter())); + EXPECT_THAT(root->operand(0)->called_computations()[1]->root_instruction(), + AllOf(op::Shape("((f32[9,2], f32[2]), token[])"), + op::Tuple(op::Tuple(op::Broadcast(op::Constant()), + op::GetTupleElement(op::GetTupleElement( + second_infeed))), + op::GetTupleElement(second_infeed)))); +} + TEST_F(SpmdPartitioningTest, TiledToReplicatedReduce) { const char* const hlo_string = R"( HloModule module @@ -527,6 +564,80 @@ ENTRY entry { op::Constant()))))); } +TEST_F(SpmdPartitioningTest, + BroadcastBothOldAndNewDimsShardedPartiallySharded) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + param = f32[4,3] parameter(0), + sharding={devices=[1,2,4]0,1,4,5,2,3,6,7 last_tile_dim_replicate} + ROOT broadcast = f32[4,4,3] broadcast(param), dimensions={1,2}, + sharding={devices=[2,1,2,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/8)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf(op::Shape("f32[2,4,2]"), + op::Broadcast(AllOf(op::Shape("f32[4,2]"), op::Parameter(0))))); +} + +TEST_F(SpmdPartitioningTest, + ConvWithParallelDimAndNonParallelSpatialDimPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,12,12,24,32] parameter(0) + %lhs.copy = f32[32,12,12,24,32] copy(%lhs), + sharding={devices=[2,2,1,1,1]0,1,2,3} + %rhs = f32[32,6,6,16,32] parameter(1) + %rhs.copy = f32[32,6,6,16,32] copy(%rhs), + sharding={devices=[2,2,1,1,1]0,1,2,3} + ROOT 
%conv = f32[32,7,7,24,16] convolution(%lhs.copy, %rhs.copy), + dim_labels=012bf_012oi->012bf, + window={size=32x6x6 stride=31x1x1 lhs_dilate=32x1x1}, + sharding={devices=[2,2,1,1,1]0,1,2,3} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), + op::Reshape(), op::Constant(), + op::Constant(), op::Constant())), + op::Shape("f32[16,6,12,24,32]")); + auto rhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), + op::Reshape(), op::Constant(), + op::Constant(), op::Constant())), + op::Shape("f32[16,3,6,16,32]")); + auto resharded_rhs = + AllOf(op::Shape("f32[16,6,6,16,32]"), + op::AllReduce(op::DynamicUpdateSlice( + op::Broadcast(), rhs, op::Constant(), op::Reshape(), + op::Constant(), op::Constant(), op::Constant()))); + + auto left_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[16,2,12,24,32]")); + auto right_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[16,3,12,24,32]")); + EXPECT_THAT( + root, + AllOf(op::Convolution( + op::Select(op::Compare(), + op::DynamicSlice( + op::Concatenate(left_halo, lhs, right_halo), + op::Constant(), op::Add(), op::Constant(), + op::Constant(), op::Constant()), + op::Broadcast()), + resharded_rhs), + op::Shape("f32[16,4,7,24,16]"))); +} + TEST_F(SpmdPartitioningTest, BroadcastPropagateTiledSharding) { const char* const hlo_string = R"( HloModule module @@ -1399,6 +1510,50 @@ ENTRY entry { op::Shape("f32[1,1,512,64]"))); } +TEST_F(SpmdPartitioningTest, + ConvolutionLhsTiledRhsTiled_UnevenDilatedRHSPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[8,28,28,8] parameter(0) + %lhs.copy = f32[8,28,28,8] copy(%lhs), sharding={devices=[1,4,1,1]0,1,2,3} + %rhs = f32[8,14,14,64] parameter(1) + %rhs.copy = f32[8,14,14,64] copy(%rhs), sharding={devices=[1,4,1,1]0,1,2,3} + ROOT %conv = f32[1,1,8,64] convolution(%lhs.copy, %rhs.copy), + window={size=14x14 pad=0_-1x0_-1 rhs_dilate=2x2}, + dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[8,7,28,8]")); + auto rhs = AllOf(op::Pad(op::Parameter(), op::Constant()), + op::Shape("f32[8,16,14,64]")); + auto selected_rhs = AllOf( + op::Select(op::Compare(), + op::Copy(op::DynamicSlice(rhs, op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Broadcast()), + op::Shape("f32[8,4,14,64]")); + auto right_halo = + AllOf(op::CollectivePermute(op::Slice(lhs)), op::Shape("f32[8,2,28,8]")); + auto selected_lhs = + AllOf(op::DynamicSlice( + op::Pad(op::Concatenate(lhs, right_halo), op::Constant()), + op::Constant(), op::Reshape(), op::Constant(), op::Constant()), + op::Shape("f32[8,7,28,8]")); + EXPECT_THAT(root, + AllOf(op::AllReduce(op::Convolution(selected_lhs, selected_rhs)), + op::Shape("f32[1,1,8,64]"))); +} + TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsTiledWithPadding) { const char* const hlo_string = R"( HloModule module @@ -2218,7 +2373,7 @@ ENTRY entry { TF_ASSERT_OK_AND_ASSIGN(auto module, PartitionComputation(hlo_string, 
/*num_devices=*/2)); - std::cout << module->ToString(); + VLOG(1) << module->ToString(); auto sort = FindInstruction(module.get(), "sort"); EXPECT_EQ(sort->operand(0)->shape().dimensions(1), 209664); EXPECT_EQ(sort->operand(1)->shape().dimensions(1), 209664); @@ -2294,7 +2449,7 @@ ENTRY entry TF_ASSERT_OK_AND_ASSIGN(auto module, PartitionComputation(hlo_string, /*num_devices=*/2)); - std::cout << module->ToString(); + VLOG(1) << module->ToString(); auto sort = FindInstruction(module.get(), "sort"); EXPECT_EQ(sort->operand(0)->shape().dimensions(1), 209664); EXPECT_EQ(sort->operand(1)->shape().dimensions(1), 209664); @@ -2612,6 +2767,35 @@ ENTRY entry { AllOf(op::Reduce(param0, op::Constant()), op::Shape("f32[64]"))); } +TEST_F(SpmdPartitioningTest, PartialTiledToPartialTiledReduce) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + %param0 = f32[4,4] parameter(0), + sharding={devices=[2,2,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + %constant.1 = f32[] constant(0), sharding={replicated} + ROOT %reduce = f32[4] reduce(%param0, %constant.1), dimensions={0}, + to_apply=%sum, + sharding={devices=[2,4]0,1,4,5,2,3,6,7 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/8)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, + AllOf(op::AllReduce(op::Reduce(op::Parameter(0), op::Constant())), + op::Shape("f32[2]"))); +} + TEST_F(SpmdPartitioningTest, TiledToTiledTupleReduce) { const char* const hlo_string = R"( HloModule module @@ -3576,6 +3760,25 @@ ENTRY entry { op::Shape("f32[3,5]"))); } +TEST_F(SpmdPartitioningTest, IndexPassthroughGather) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %input = f32[2,9,8] parameter(0), sharding={replicated} + %indices = s32[4,2,4] parameter(1), sharding={devices=[2,1,2]0,1,2,3} + ROOT %gather = f32[8,4,4] gather(%input, %indices), offset_dims={0}, + collapsed_slice_dims={0,1}, start_index_map={0,1}, index_vector_dim=1, + slice_sizes={1,1,8}, sharding={devices=[1,2,2]0,1,2,3} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Gather(op::Parameter(0), op::Parameter(1)), + op::Shape("f32[8,2,2]"))); +} + TEST_F(SpmdPartitioningTest, GatherPartitionedOnTrivialSliceDims) { const char* const hlo_string = R"( HloModule module @@ -3635,6 +3838,74 @@ ENTRY entry { op::Shape("f32[2,5]"))); } +TEST_F(SpmdPartitioningTest, IndexPassthroughScatter) { + const char* const hlo_string = R"( +HloModule module + +add (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT sum = f32[] add(lhs, rhs) +} + +ENTRY entry { + %input = f32[2,9,8] parameter(0), sharding={replicated} + %indices = s32[4,2,4] parameter(1), sharding={devices=[2,1,2]0,1,2,3} + %updates = f32[4,4,8] parameter(2), sharding={devices=[2,2,1]0,1,2,3} + ROOT %scatter = f32[2,9,8] scatter(%input, %indices, %updates), + to_apply=add, + update_window_dims={2}, + inserted_window_dims={0,1}, + scatter_dims_to_operand_dims={0,1}, + index_vector_dim=1, sharding={replicated} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + 
HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf(op::AllReduce(op::Scatter( + op::Select(op::Broadcast(op::Convert(op::PartitionId())), + op::Broadcast(op::Constant()), op::Parameter(0)), + op::Parameter(1), op::Parameter(2))), + op::Shape("f32[2,9,8]"))); +} + +TEST_F(SpmdPartitioningTest, IndexPassthroughScatter_Min) { + const char* const hlo_string = R"( +HloModule module + +min (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT min = f32[] minimum(lhs, rhs) +} + +ENTRY entry { + %input = f32[2,9,8] parameter(0), sharding={replicated} + %indices = s32[4,2,4] parameter(1), sharding={devices=[2,1,2]0,1,2,3} + %updates = f32[4,4,8] parameter(2), sharding={devices=[2,2,1]0,1,2,3} + ROOT %scatter = f32[2,9,8] scatter(%input, %indices, %updates), + to_apply=min, + update_window_dims={2}, + inserted_window_dims={0,1}, + scatter_dims_to_operand_dims={0,1}, + index_vector_dim=1, sharding={replicated} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf(op::AllReduce(op::Scatter( + op::Select(op::Broadcast(op::Convert(op::PartitionId())), + op::Broadcast(op::Constant()), op::Parameter(0)), + op::Parameter(1), op::Parameter(2))), + op::Shape("f32[2,9,8]"))); +} + TEST_F(SpmdPartitioningTest, ScatterPartitionedOnTrivialSliceDims) { const char* const hlo_string = R"( HloModule module @@ -3766,6 +4037,364 @@ ENTRY entry { op::Parameter(0)))); } +TEST_F(SpmdPartitioningTest, SubgroupAllToAllReshard) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[8,8,8,8] parameter(0), + sharding={devices=[2,2,1,2]0,1,2,3,4,5,6,7} + ROOT %copy = f32[8,8,8,8] copy(%param0), + sharding={devices=[1,2,2,2]0,1,4,5,2,3,6,7} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/8)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto reshape = + AllOf(op::Shape("f32[4,4,2,4,4]"), op::Reshape(op::Parameter(0))); + auto all_to_all = AllOf(op::Shape("f32[4,4,2,4,4]"), op::AllToAll(reshape)); + auto xpose = AllOf(op::Shape("f32[2,4,4,4,4]"), op::Transpose(all_to_all)); + EXPECT_THAT(root, + op::Copy(AllOf(op::Reshape(xpose), op::Shape("f32[8,4,4,4]")))); + EXPECT_EQ(root->operand(0)->operand(0)->operand(0)->replica_groups().size(), + 4); +} + +TEST_F(SpmdPartitioningTest, SubgroupAllToAllReshard2) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[8,8] parameter(0), + sharding={devices=[2,4]0,1,2,3,4,5,6,7} + ROOT %copy = f32[8,8] copy(%param0), + sharding={devices=[4,2]0,1,4,5,2,3,6,7} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/8)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto all_to_all = op::AllToAll( + AllOf(op::Shape("f32[2,2,2]"), op::Reshape(op::Parameter(0)))); + auto reshape = + AllOf(op::Shape("f32[2,4]"), op::Reshape(op::Transpose(all_to_all))); + EXPECT_THAT(root, op::Copy(op::CollectivePermute(reshape))); +} + +TEST_F(SpmdPartitioningTest, SubgroupAllToAllReshard3) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[8,8,8] parameter(0), + sharding={devices=[2,4,1]0,1,2,3,4,5,6,7} + ROOT %copy = f32[8,8,8] 
copy(%param0), + sharding={devices=[1,2,4]0,1,4,5,2,3,6,7} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/8)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto all_to_all = op::AllToAll( + AllOf(op::Shape("f32[4,2,4,2]"), op::Reshape(op::Parameter(0)))); + auto reshape = + AllOf(op::Shape("f32[4,8,2]"), op::Reshape(op::Transpose(all_to_all))); + auto all_to_all2 = + op::AllToAll(AllOf(op::Shape("f32[4,2,4,2]"), op::Reshape(reshape))); + auto reshape2 = + AllOf(op::Shape("f32[8,4,2]"), op::Reshape(op::Transpose(all_to_all2))); + EXPECT_THAT(root, op::Copy(op::CollectivePermute(reshape2))); +} + +TEST_F(SpmdPartitioningTest, Dot2DPartitionedNonContractingAndContracting0) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[48,12] parameter(0), sharding={devices=[2,2]0,1,2,3} + %rhs = f32[32,12] parameter(1), sharding={devices=[2,2]0,1,2,3} + ROOT %dot = f32[48,32] dot(%lhs, %rhs), + lhs_batch_dims={}, rhs_batch_dims={}, + lhs_contracting_dims={1}, rhs_contracting_dims={1}, + sharding={devices=[2,2]0,1,2,3} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto lhs = AllOf(op::Shape("f32[24,6]"), op::Parameter(0)); + auto partial_replicated_lhs = + AllOf(op::Shape("f32[24,12]"), + op::AllReduce(op::DynamicUpdateSlice(_, lhs, _, _))); + auto rhs = AllOf(op::Shape("f32[16,6]"), op::Parameter(1)); + auto partial_replicated_rhs = + AllOf(op::Shape("f32[16,12]"), op::AllReduce(op::DynamicUpdateSlice( + _, op::CollectivePermute(rhs), _, _))); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, + AllOf(op::Dot(partial_replicated_lhs, partial_replicated_rhs), + op::Shape("f32[24,16]"))); +} + +TEST_F(SpmdPartitioningTest, Dot2DPartitionedNonContractingAndContracting1) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[48,100] parameter(0), sharding={devices=[2,2]0,1,2,3} + %rhs = f32[32,100] parameter(1), sharding={devices=[2,2]0,1,2,3} + ROOT %dot = f32[48,32] dot(%lhs, %rhs), + lhs_batch_dims={}, rhs_batch_dims={}, + lhs_contracting_dims={1}, rhs_contracting_dims={1}, + sharding={devices=[2,2]0,1,2,3} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto lhs = AllOf(op::Shape("f32[24,50]"), op::Parameter(0)); + auto rhs = AllOf(op::Shape("f32[16,50]"), op::Parameter(1)); + auto partial_replicated_rhs = + AllOf(op::Shape("f32[32,50]"), + op::AllReduce(op::DynamicUpdateSlice(_, rhs, _, _))); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, AllOf(op::Shape("f32[24,16]"), + op::DynamicSlice( + op::AllReduce(AllOf(op::Dot(lhs, partial_replicated_rhs), + op::Shape("f32[24,32]"))), + _, _))); +} + +TEST_F(SpmdPartitioningTest, Dot2DPartitionedNonContractingAndContracting2) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[48,100] parameter(0), sharding={replicated} + %rhs = f32[32,100] parameter(1), sharding={devices=[2,2]0,1,2,3} + ROOT %dot = f32[48,32] dot(%lhs, %rhs), + lhs_batch_dims={}, rhs_batch_dims={}, + lhs_contracting_dims={1}, rhs_contracting_dims={1}, + sharding={devices=[2,2]0,1,2,3} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto lhs = 
AllOf(op::Shape("f32[48,100]"), op::Parameter(0)); + auto lhs_slice = AllOf(op::Shape("f32[24,100]"), op::DynamicSlice(lhs, _, _)); + auto rhs = AllOf(op::Shape("f32[16,50]"), op::Parameter(1)); + auto partial_replicated_rhs = AllOf( + op::Shape("f32[16,100]"), op::AllReduce(op::DynamicUpdateSlice( + _, op::CollectivePermute(rhs), _, _))); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("f32[24,16]"), + op::Dot(lhs_slice, partial_replicated_rhs))); +} + +TEST_F(SpmdPartitioningTest, Dot2DPartitionedBatchAndNonContracting) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[4,24,100] parameter(0), sharding={devices=[2,2,1]0,1,2,3} + %rhs = f32[4,32,100] parameter(1), sharding={devices=[2,2,1]0,1,2,3} + ROOT %dot = f32[4,24,32] dot(%lhs, %rhs), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={2}, + sharding={devices=[2,2,1]0,1,2,3} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto lhs = AllOf(op::Shape("f32[2,12,100]"), op::Parameter(0)); + auto rhs = AllOf(op::Shape("f32[2,16,100]"), op::Parameter(1)); + auto partial_replicated_rhs = + AllOf(op::Shape("f32[2,32,100]"), + op::AllReduce(op::DynamicUpdateSlice(_, rhs, _, _, _))); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("f32[2,12,32]"), + op::Dot(lhs, partial_replicated_rhs))); +} + +TEST_F(SpmdPartitioningTest, Dot2DPartitionedBatchAndContracting) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[4,24,100] parameter(0), sharding={devices=[2,1,2]0,1,2,3} + %rhs = f32[4,32,100] parameter(1), sharding={devices=[1,2,2]0,1,2,3} + ROOT %dot = f32[4,24,32] dot(%lhs, %rhs), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={2}, + sharding={devices=[2,2,1]0,1,2,3} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto lhs = AllOf(op::Shape("f32[2,24,50]"), op::Parameter(0)); + auto rhs = AllOf(op::Shape("f32[4,16,50]"), op::Parameter(1)); + auto resharded_rhs = + AllOf(op::Shape("f32[2,32,50]"), + op::Reshape(op::Transpose(op::AllToAll(op::Reshape(rhs))))); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("f32[2,12,32]"), + op::DynamicSlice( + AllOf(op::Shape("f32[2,24,32]"), + op::AllReduce(op::Dot(lhs, resharded_rhs))), + _, _, _))); +} + +TEST_F(SpmdPartitioningTest, Dot2DPartitionedBatchAndContracting2) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[4,24,100] parameter(0), sharding={devices=[2,1,2]0,1,2,3} + %rhs = f32[4,32,100] parameter(1), sharding={replicated} + ROOT %dot = f32[4,24,32] dot(%lhs, %rhs), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={2}, + sharding={devices=[2,2,1]0,1,2,3} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto lhs = AllOf(op::Shape("f32[2,24,50]"), op::Parameter(0)); + auto resharded_lhs = + AllOf(op::Shape("f32[2,12,100]"), + op::Reshape(op::Transpose(op::AllToAll(op::Reshape(lhs))))); + auto rhs = AllOf(op::Shape("f32[4,32,100]"), op::Parameter(1)); + auto rhs_slice = + AllOf(op::Shape("f32[2,32,100]"), op::DynamicSlice(rhs, _, _, _)); + auto root 
= module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("f32[2,12,32]"), + op::Dot(resharded_lhs, rhs_slice))); +} + +TEST_F(SpmdPartitioningTest, + Dot2DPartitionedBatchNonContractingAndContracting) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[4,24,100] parameter(0), sharding={devices=[2,1,2]0,1,2,3} + %rhs = f32[4,32,100] parameter(1), sharding={devices=[2,2,1]0,1,2,3} + ROOT %dot = f32[4,24,32] dot(%lhs, %rhs), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={2}, + sharding={devices=[2,1,2]0,1,2,3} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto lhs = AllOf(op::Shape("f32[2,24,50]"), op::Parameter(0)); + auto rhs = AllOf(op::Shape("f32[2,16,100]"), op::Parameter(1)); + auto partial_replicated_lhs = + AllOf(op::Shape("f32[2,24,100]"), + op::AllReduce(op::DynamicUpdateSlice(_, lhs, _, _, _))); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("f32[2,24,16]"), + op::Dot(partial_replicated_lhs, rhs))); +} + +TEST_F(SpmdPartitioningTest, Dot2DPartitionedBatchAndReshard) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[4,8,24,100] parameter(0), sharding={devices=[2,1,2,1]0,1,2,3} + %rhs = f32[4,8,32,100] parameter(1), sharding={devices=[2,1,2,1]0,1,2,3} + ROOT %dot = f32[4,8,24,32] dot(%lhs, %rhs), + lhs_batch_dims={0,1}, rhs_batch_dims={0,1}, + lhs_contracting_dims={3}, rhs_contracting_dims={3}, + sharding={devices=[1,2,2,1]0,1,2,3} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto lhs = AllOf(op::Shape("f32[2,8,12,100]"), op::Parameter(0)); + auto rhs = AllOf(op::Shape("f32[2,8,16,100]"), op::Parameter(1)); + auto partial_replicated_rhs = + AllOf(op::Shape("f32[2,8,32,100]"), + op::AllReduce(op::DynamicUpdateSlice(_, rhs, _, _, _, _))); + auto dot = + AllOf(op::Shape("f32[2,8,12,32]"), op::Dot(lhs, partial_replicated_rhs)); + auto reshape = AllOf(op::Shape("f32[2,2,4,12,32]"), op::Reshape(dot)); + auto all_to_all = AllOf(op::Shape("f32[2,2,4,12,32]"), op::AllToAll(reshape)); + auto xpose = AllOf(op::Shape("f32[2,2,4,12,32]"), op::Transpose(all_to_all)); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("f32[4,4,12,32]"), op::Reshape(xpose))); +} + +TEST_F(SpmdPartitioningTest, + ElementwiseTest_PartialReplicateToTiledHaloExchange) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + constant = f32[6,3]{1,0} + constant({{1,3,7},{5,1,4},{1,2,8},{2,3,7},{5,2,4},{2,2,8}}), + sharding={replicated} + constant.1 = f32[6,3]{1,0} + constant({{2,7,2},{2,9,2},{2,6,2},{3,7,2},{2,9,3},{2,3,2}}), + sharding={replicated} + multiply = f32[6,3]{1,0} multiply(constant, constant.1), + sharding={devices=[2,1,2]0,1,2,3 last_tile_dim_replicate} + ROOT add = f32[6,3]{1,0} add(multiply, constant.1), + sharding={devices=[4,1]0,1,2,3} +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + auto partial_replicate_lhs = + AllOf(op::Shape("f32[3,3]"), + op::DynamicSlice(op::Constant(), op::Reshape(), op::Constant())); + auto partial_replicate_rhs = + AllOf(op::Shape("f32[3,3]"), + op::DynamicSlice(op::Constant(), op::Reshape(), op::Constant())); + auto multiply = + 
AllOf(op::Shape("f32[3,3]"), + op::Multiply(partial_replicate_lhs, partial_replicate_rhs)); + auto right_halo = + AllOf(op::Shape("f32[1,3]"), op::CollectivePermute(op::Slice(multiply))); + auto add_lhs = AllOf( + op::Shape("f32[2,3]"), + op::DynamicSlice( + op::DynamicSlice( + op::Pad(op::Concatenate(multiply, right_halo), op::Constant()), + op::Reshape(), op::Constant()), + op::Reshape(), op::Constant())); + auto add_rhs = AllOf(op::Shape("f32[2,3]"), + op::DynamicSlice(op::Pad(op::Constant(), op::Constant()), + op::Reshape(), op::Constant())); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("f32[2,3]"), op::Add(add_lhs, add_rhs))); +} + } // namespace } // namespace spmd } // namespace xla diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc index 3354a9c3233..3443c6e013d 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc @@ -16,7 +16,12 @@ limitations under the License. #include "tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h" #include +#include +#include "absl/algorithm/container.h" +#include "absl/container/flat_hash_map.h" +#include "absl/memory/memory.h" +#include "absl/strings/str_join.h" #include "absl/types/optional.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/hlo_casting_utils.h" @@ -25,10 +30,13 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_sharding.h" +#include "tensorflow/compiler/xla/service/hlo_sharding_util.h" #include "tensorflow/compiler/xla/service/pattern_matcher.h" +#include "tensorflow/compiler/xla/service/shape_inference.h" #include "tensorflow/compiler/xla/service/spmd/spmd_partitioner.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/util.h" +#include "tensorflow/compiler/xla/window_util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" namespace xla { @@ -41,6 +49,23 @@ bool HasReplicatedSharding(const HloSharding& sharding) { return sharding.IsReplicated(); } +HloInstruction* CreateConstant(const Shape& shape, Literal value, + SpmdBuilder* b) { + if (shape.IsTuple()) { + std::vector elements; + for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { + elements.push_back(CreateConstant( + ShapeUtil::GetTupleElementShape(shape, i), value.Clone(), b)); + } + return b->AddInstruction(HloInstruction::CreateTuple(elements)); + } + + CHECK( + ShapeUtil::IsScalarWithElementType(value.shape(), shape.element_type())); + auto c = b->AddInstruction(HloInstruction::CreateConstant(std::move(value))); + return b->AddInstruction(HloInstruction::CreateBroadcast(shape, c, {})); +} + HloInstruction* CreateZero(const Shape& shape, SpmdBuilder* b) { if (shape.IsTuple()) { std::vector elements; @@ -59,6 +84,24 @@ HloInstruction* CreateZero(const Shape& shape, SpmdBuilder* b) { return b->AddInstruction(HloInstruction::CreateBroadcast(shape, zero, {})); } +HloInstruction* CreateOne(const Shape& shape, SpmdBuilder* b) { + if (shape.IsTuple()) { + std::vector elements; + for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { + elements.push_back( + CreateOne(ShapeUtil::GetTupleElementShape(shape, i), b)); + } + return b->AddInstruction(HloInstruction::CreateTuple(elements)); + } + + if 
(shape.IsToken()) { + return b->AddInstruction(HloInstruction::CreateToken()); + } + auto one = b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::One(shape.element_type()))); + return b->AddInstruction(HloInstruction::CreateBroadcast(shape, one, {})); +} + HloComputation* MakeBinaryAdd(PrimitiveType type, HloModule* module) { HloComputation::Builder sum_b("add"); auto x = sum_b.AddInstruction(HloInstruction::CreateParameter( @@ -128,6 +171,16 @@ Shape MakeNonPaddedShapeForGivenPartition(const Shape& shape, return ShapeUtil::MakeTupleShape(subshapes); } + if (sharding.IsReplicated()) { + return shape; + } + if (sharding.IsTileMaximal()) { + if (partition_id == *sharding.UniqueDevice()) { + return shape; + } + return ShapeUtil::MakeTupleShape({}); + } + auto partition_shape = shape; std::vector tile_offset = sharding.TileOffsetForDevice(shape, partition_id); @@ -143,10 +196,10 @@ Shape MakeNonPaddedShapeForGivenPartition(const Shape& shape, return partition_shape; } -std::vector MakePartitionOffsets(const Shape& shape, - const HloSharding& sharding, - HloInstruction* partition_id, - SpmdBuilder* b) { +std::vector MakePartitionOffsets( + const Shape& shape, const HloSharding& sharding, + HloInstruction* partition_id, SpmdBuilder* b, + absl::Span dims) { CHECK(!shape.IsTuple()); Array2D offset_array( @@ -158,7 +211,8 @@ std::vector MakePartitionOffsets(const Shape& shape, LiteralUtil::CreateR2FromArray2D(offset_array))); std::vector offsets; for (int64 i = 0; i < shape.rank(); ++i) { - if (sharding.tile_assignment().dim(i) == 1) { + if (sharding.tile_assignment().dim(i) == 1 || + (!dims.empty() && !absl::c_linear_search(dims, i))) { offsets.push_back(b->AddInstruction( HloInstruction::CreateConstant(LiteralUtil::Zero(S32)))); } else { @@ -177,8 +231,11 @@ std::vector MakePartitionOffsets(const Shape& shape, std::vector MakeTiledPartitionOrdinals( const HloSharding& sharding, HloInstruction* partition_id, SpmdBuilder* b) { CHECK(!sharding.IsTileMaximal()); - auto table_shape = - ShapeUtil::MakeShape(S32, sharding.tile_assignment().dimensions()); + auto dimensions = sharding.tile_assignment().dimensions(); + if (sharding.ReplicateOnLastTileDim()) { + dimensions.pop_back(); + } + auto table_shape = ShapeUtil::MakeShape(S32, dimensions); return MakePartitionOffsets(table_shape, sharding, partition_id, b); } @@ -235,6 +292,195 @@ HloInstruction* PadBaseShapeBeforeUnevenTiledSharding( return PadToShape(hlo, padded_base_shape, b); } +// TODO(wangtao): generize this function when target is partial replicate. +absl::optional PartialReplicateToTileCompatibleSharding( + const HloSharding& partial_sharding, + const std::vector& target_tile_dims) { + if (!partial_sharding.ReplicateOnLastTileDim()) { + return absl::nullopt; + } + int64 rank = partial_sharding.tile_assignment().num_dimensions() - 1; + if (target_tile_dims.size() < rank) { + return absl::nullopt; + } + // A dimension is expanded when target_tile_size > partial_tile_size and + // target_tile_size % partial_tile_size == 0. + // expand_tile_dims_positions is the index of the expand_dim. + std::vector expand_tile_dims_indices(rank, -1); + // expand_tile_size = target_tile_size / partial_tile_size. 
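The partial-replicate-to-tile conversion here first checks whether the target tiling is reachable: each target tile count must be a multiple of the source count, and the per-dimension expansion factors must multiply to the trailing replication factor. A simplified standalone version of that feasibility check (using std::optional rather than absl::optional, and plain tile-count vectors rather than HloSharding):

#include <cstdint>
#include <optional>
#include <vector>

std::optional<std::vector<int64_t>> ExpandFactors(
    const std::vector<int64_t>& partial_tiles,  // without the replication dim
    int64_t replication, const std::vector<int64_t>& target_tiles) {
  std::vector<int64_t> factors;
  int64_t product = 1;
  for (size_t d = 0; d < partial_tiles.size(); ++d) {
    if (target_tiles[d] % partial_tiles[d] != 0) return std::nullopt;
    int64_t f = target_tiles[d] / partial_tiles[d];
    if (f > 1) {
      factors.push_back(f);  // expand_tile_size for this dimension
      product *= f;
    }
  }
  if (product != replication) return std::nullopt;
  return factors;
}
// Example: partial tiles {1,2} with replication 4 and target tiles {4,2}
// gives the single expansion factor {4}.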
+ std::vector expand_tile_sizes; + int num_expand_dims = 0; + for (int64 dim = 0; dim < rank; dim++) { + int64 partial_tile_size = partial_sharding.tile_assignment().dim(dim); + int64 target_tile_size = target_tile_dims[dim]; + if (target_tile_size % partial_tile_size != 0 || + target_tile_size < partial_tile_size) { + return absl::nullopt; + } + + if (target_tile_size > partial_tile_size) { + expand_tile_dims_indices[dim] = num_expand_dims++; + expand_tile_sizes.emplace_back(target_tile_size / partial_tile_size); + } + } + + // Reshape the partial replicate tile_dimensions. + auto reshape_dimensions = partial_sharding.tile_assignment().dimensions(); + int64 num_replication = reshape_dimensions.back(); + if (num_replication != Product(expand_tile_sizes)) { + return absl::nullopt; + } + reshape_dimensions.pop_back(); + reshape_dimensions.insert(reshape_dimensions.end(), expand_tile_sizes.begin(), + expand_tile_sizes.end()); + auto reshape_tile_assignment = partial_sharding.tile_assignment(); + + // Transpose. + std::vector perm; + perm.reserve(rank); + for (int64 dim = 0; dim < rank; dim++) { + perm.emplace_back(dim); + if (expand_tile_dims_indices[dim] > -1) { + perm.emplace_back(expand_tile_dims_indices[dim] + rank); + } + } + auto transpose_sharding = hlo_sharding_util::TransposeSharding( + HloSharding::Tile(reshape_tile_assignment), perm); + + // Reshape to target shape + auto transpose_tile_assignment = transpose_sharding.tile_assignment(); + transpose_tile_assignment.Reshape(target_tile_dims); + + return HloSharding::Tile(transpose_tile_assignment); +} + +absl::optional PadFromPartialReplicateShape( + HloInstruction* hlo, const Shape& base_shape, + const HloSharding& src_sharding, const HloSharding& dst_sharding, + const std::vector& expand_tile_dims, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, HloInstruction* partition_id, SpmdBuilder* b) { + auto padded_src_shape = + GetPaddedShapeForUnevenPartitioning(base_shape, src_sharding); + auto padded_dst_shape = + GetPaddedShapeForUnevenPartitioning(base_shape, dst_sharding); + if (ShapeUtil::Compatible(padded_dst_shape, hlo->shape())) { + return hlo; + } + + auto partition_ordinals = + MakeTiledPartitionOrdinals(src_sharding, partition_id, b); + + HloInstruction* result = hlo; + auto zero = b->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(hlo->shape().element_type()))); + std::vector expand_dims_without_halo_exchange; + // Pad the dimensions needs halo exchange and record the padded dims that + // won't need halo exchange. + for (auto dim : expand_tile_dims) { + int64 src_shard_count = src_sharding.tile_assignment().dim(dim); + int64 src_per_shard_size = + padded_src_shape.dimensions(dim) / src_shard_count; + // Calculate per shard size using the sharding to compare if dst_sharding + // needs more padding at the end. + int64 dst_per_shard_size = + padded_dst_shape.dimensions(dim) / src_shard_count; + + // If dst_sharding doesn't need more padding at the end. + if (src_per_shard_size >= dst_per_shard_size) { + continue; + } + // If src sharding at this dimension is not partitoned, simply pad to + // the desired shape. + if (src_shard_count == 1) { + expand_dims_without_halo_exchange.emplace_back(dim); + continue; + } + + // If dst_padding needs more padding at the end, need to re-distribute the + // data between each shard using collective permute. + // For example, if dimension size is 6 and shard 2 ways in the src but + // needs to shard 4 ways in the dst. 
4 ways needs padding 2 0s at the end + // and has 2 elements at each shard, while 2 way sharding has 3 elements + // in each shard, re-distribution is needed. + // + // 1. Calculate left_halo size. + // left-halo size is 0 + OffsetCalculation left_halo_size_function = + OffsetCalculation(MultiplyAddDivideOffsetCalculation(0, 0, 1)); + + // 2. Calculate right_halo size. + // right-halo size is D * (i + 1) - S * (i + 1) = (D - S) * i + (D - S) + OffsetCalculation right_halo_size_function = + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + dst_per_shard_size - src_per_shard_size, + dst_per_shard_size - src_per_shard_size, 1)); + + auto concat = result; + // 3. Halo exchange. + auto halo_exchange_result = ExchangeHalo( + result, left_halo_size_function, right_halo_size_function, dim, + src_sharding, collective_ops_creator, next_channel_id, b); + + if (halo_exchange_result.has_value()) { + concat = halo_exchange_result.value(); + } else { + return absl::nullopt; + } + + // 4. Pad. + std::vector zero_padding(concat->shape().rank()); + PaddingConfig pad_config = window_util::MakeSymmetricPadding(zero_padding); + pad_config.mutable_dimensions(dim)->set_edge_padding_low(0); + int64 max_right_halo_size = + right_halo_size_function.MaxInRange(0, src_shard_count - 1); + pad_config.mutable_dimensions(dim)->set_edge_padding_high(std::max( + 0LL, padded_dst_shape.dimensions(dim) - + padded_src_shape.dimensions(dim) - max_right_halo_size)); + auto padded_concat_shape = ShapeInference::InferPadShape( + concat->shape(), zero->shape(), pad_config) + .ValueOrDie(); + concat = b->AddInstruction(HloInstruction::CreatePad( + padded_concat_shape, concat, zero, pad_config)); + + // 5. Slice the valid result. + // Slice offset is (D-S) * i + auto zero_s32 = b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(S32))); + OffsetCalculation start_offset_on_padded_concat_calculation = + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + dst_per_shard_size - src_per_shard_size, 0, 1)); + auto slice_shape = concat->shape(); + slice_shape.set_dimensions(dim, dst_per_shard_size); + std::vector slice_offsets(concat->shape().rank(), + zero_s32); + slice_offsets[dim] = start_offset_on_padded_concat_calculation.Calculate( + partition_ordinals[dim], b); + result = b->AddInstruction(HloInstruction::CreateDynamicSlice( + slice_shape, concat, slice_offsets, slice_shape.dimensions())); + } + + // Pad other dimensions that won't need halo exchange with a single pad. + if (!expand_dims_without_halo_exchange.empty()) { + std::vector zero_padding(result->shape().rank()); + PaddingConfig pad_config = window_util::MakeSymmetricPadding(zero_padding); + + auto padded_shape = result->shape(); + for (auto dim : expand_dims_without_halo_exchange) { + pad_config.mutable_dimensions(dim)->set_edge_padding_low(0); + pad_config.mutable_dimensions(dim)->set_edge_padding_high( + padded_dst_shape.dimensions(dim) - padded_src_shape.dimensions(dim)); + padded_shape.set_dimensions(dim, result->shape().dimensions(dim) + + padded_dst_shape.dimensions(dim) - + padded_src_shape.dimensions(dim)); + } + result = b->AddInstruction( + HloInstruction::CreatePad(padded_shape, result, zero, pad_config)); + } + + return result; +} + absl::optional UniqueTiledDim(const HloSharding& sharding) { if (sharding.IsTileMaximal()) { return absl::nullopt; @@ -877,5 +1123,461 @@ HloInstruction* SliceFirstK(HloInstruction* hlo, SpmdBuilder* builder, output_shape, hlo, start_indices, limit_indices, strides)); } +// Check if a dimension is sharded. 
+int64 ShardCountAtDim(const HloSharding& sharding, int64 dim) { + if (sharding.IsTileMaximal()) { + return 1; + } + return sharding.tile_assignment().dim(dim); +} + +absl::optional>> +GetReshardAllToAllSourceTargetDims(const HloSharding& source, + const HloSharding& target) { + if (source.IsTileMaximal() || target.IsTileMaximal() || + source.tile_assignment().num_dimensions() != + target.tile_assignment().num_dimensions() || + source.NumTiles() != target.NumTiles()) { + return absl::nullopt; + } + // Record partition count to index for indices that have different partition + // counts on source and target. + std::map> source_size_to_dim; + std::map> target_size_to_dim; + for (int64 i = 0; i < source.tile_assignment().num_dimensions(); ++i) { + if (source.tile_assignment().dim(i) == target.tile_assignment().dim(i)) { + continue; + } + source_size_to_dim[source.tile_assignment().dim(i)].push_back(i); + target_size_to_dim[target.tile_assignment().dim(i)].push_back(i); + } + // In order to shard via AllToAll, source_size_to_dim and target_size_to_dim + // must have the same distribution. + if (source_size_to_dim.empty() || + source_size_to_dim.size() != target_size_to_dim.size()) { + return absl::nullopt; + } + for (const auto& entry : source_size_to_dim) { + auto target_it = target_size_to_dim.find(entry.first); + if (target_it == target_size_to_dim.end() || + target_it->second.size() != entry.second.size()) { + return absl::nullopt; + } + } + std::vector> result; + auto remove_entry = [](int64 size, int64 dim, + std::map>& size_to_dim) { + size_to_dim[size].erase( + std::remove_if(size_to_dim[size].begin(), size_to_dim[size].end(), + [dim](int64 a) { return a == dim; }), + size_to_dim[size].end()); + if (size_to_dim[size].empty()) { + size_to_dim.erase(size); + } + }; + // Find one pair of dimensions to swap at a time. + while (!source_size_to_dim.empty()) { + int64 source_size = source_size_to_dim.begin()->first; + int64 i = source_size_to_dim.begin()->second.back(); + int64 target_i_size = target.tile_assignment().dim(i); + if (target_i_size == source_size) { + remove_entry(source_size, i, source_size_to_dim); + remove_entry(source_size, i, target_size_to_dim); + continue; + } + auto j_it = source_size_to_dim[target_i_size].begin(); + int64 j = *j_it; + if (source_size == 1) { + // If possible, find a j where the target partition count is not one, so + // that when we swap, the resulting size-1 dimension will still be useful + // to other dimensions. + while (target.tile_assignment().dim(j) == 1) { + if (++j_it == source_size_to_dim[target_i_size].end()) { + break; + } + j = *j_it; + } + } else if (target_i_size % source_size == 0) { + // If possible, find a j where the target partition count is source_size, + // so that we can do a single swap. 
+ while (target.tile_assignment().dim(j) != source_size) { + if (++j_it == source_size_to_dim[target_i_size].end()) { + break; + } + j = *j_it; + } + } else { + return absl::nullopt; + } + result.emplace_back(j, i); + remove_entry(target_i_size, i, target_size_to_dim); + source_size_to_dim.begin()->second.back() = j; + remove_entry(target_i_size, j, source_size_to_dim); + } + return result; +} + +bool CanReshardWithCollectivePermute(const HloSharding& source, + const HloSharding& target) { + return !source.IsTileMaximal() && !target.IsTileMaximal() && + source.tile_assignment().dimensions() == + target.tile_assignment().dimensions() && + source.ReplicateOnLastTileDim() == target.ReplicateOnLastTileDim() && + source.tile_assignment() != target.tile_assignment(); +} + +GroupedSharding GroupShardingOnDims(const HloSharding& sharding, + absl::Span group_dims) { + CHECK(!sharding.IsTileMaximal()); + std::vector grouped_tiling_dims = + sharding.tile_assignment().dimensions(); + std::vector group_dim_sizes(group_dims.size()); + for (int64 i = 0; i < group_dims.size(); ++i) { + group_dim_sizes[i] = grouped_tiling_dims[group_dims[i]]; + grouped_tiling_dims[group_dims[i]] = 1; + } + std::vector> device_groups(Product(group_dim_sizes)); + sharding.tile_assignment().Each( + [&](absl::Span indices, int64 device) { + int64 group_id = 0; + for (int64 dim : group_dims) { + group_id *= sharding.tile_assignment().dim(dim); + group_id += indices[dim]; + } + device_groups[group_id].push_back(device); + }); + Array grouped_tiling(grouped_tiling_dims); + grouped_tiling.FillIota(0); + return GroupedSharding( + std::move(device_groups), + std::vector(group_dims.begin(), group_dims.end()), + std::move(group_dim_sizes), sharding.tile_assignment().num_dimensions(), + HloSharding::Tile(grouped_tiling)); +} + +HloSharding UngroupSharding(const GroupedSharding& grouped_sharding) { + CHECK(!grouped_sharding.sharding.IsTileMaximal()); + std::vector tiling_dims = + grouped_sharding.sharding.tile_assignment().dimensions(); + for (int64 i = 0; i < grouped_sharding.group_dims.size(); ++i) { + tiling_dims[grouped_sharding.group_dims[i]] = + grouped_sharding.group_dim_sizes[i]; + } + Array tiling(tiling_dims); + grouped_sharding.sharding.tile_assignment().Each( + [&](absl::Span indices, int64 device) { + std::vector ungrouped_inds(indices.begin(), indices.end()); + for (int64 g = 0; g < grouped_sharding.device_groups.size(); ++g) { + int64 remaining_group_index = g; + for (int64 i = grouped_sharding.group_dims.size() - 1; i >= 0; --i) { + ungrouped_inds[grouped_sharding.group_dims[i]] = + remaining_group_index % grouped_sharding.group_dim_sizes[i]; + remaining_group_index /= grouped_sharding.group_dim_sizes[i]; + } + tiling(ungrouped_inds) = grouped_sharding.device_groups[g][device]; + } + }); + return HloSharding::Tile(tiling); +} + +GroupedSharding AlignGroupsWith(GroupedSharding grouped_sharding, + const GroupedSharding& reference, + bool ignore_group_order) { + // Returns src -> dst index mapping. 
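+  // For example, src = {2, 3, 5} and dst = {3, 5, 2} yields {2, 0, 1}.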
+ auto get_permutation = [](absl::Span src, + absl::Span dst) { + CHECK_EQ(src.size(), dst.size()); + absl::flat_hash_map dst_reverse_map; + for (int64 i = 0; i < dst.size(); ++i) { + dst_reverse_map[dst[i]] = i; + } + std::vector permutation(src.size()); + for (int64 i = 0; i < src.size(); ++i) { + auto it = dst_reverse_map.find(src[i]); + CHECK(it != dst_reverse_map.end()); + permutation[i] = it->second; + } + return permutation; + }; + CHECK_EQ(grouped_sharding.device_groups.size(), + reference.device_groups.size()); + absl::flat_hash_map device_to_ref_group; + for (int64 g = 0; g < reference.device_groups.size(); ++g) { + for (int64 device : reference.device_groups[g]) { + device_to_ref_group[device] = g; + } + } + auto unique_ref_dev_group = [&](absl::Span devices) -> int64 { + int64 ref_g = -1; + for (int64 device : devices) { + if (ref_g == -1) { + ref_g = device_to_ref_group[device]; + } else if (ref_g != device_to_ref_group[device]) { + return -1; + } + } + return ref_g; + }; + bool matching_groups = true; + std::vector original_src_to_ref_permutation; + for (int64 g = 0; g < grouped_sharding.device_groups.size(); ++g) { + int64 ref_g = unique_ref_dev_group(grouped_sharding.device_groups[g]); + if (ref_g < 0 || (!ignore_group_order && g != ref_g)) { + matching_groups = false; + break; + } + if (g == 0) { + original_src_to_ref_permutation = get_permutation( + grouped_sharding.device_groups[g], reference.device_groups[ref_g]); + } + } + if (matching_groups) { + auto tiles = grouped_sharding.sharding.tile_assignment(); + tiles.Each([&](absl::Span indices, int64* device) { + *device = original_src_to_ref_permutation[*device]; + }); + grouped_sharding.sharding = HloSharding::Tile(tiles); + } + grouped_sharding.device_groups = std::move(reference.device_groups); + return grouped_sharding; +} + +Shape GetPerGroupBaseShape(const GroupedSharding& grouped_sharding, + const Shape& original_base_shape) { + auto result = original_base_shape; + for (int64 i = 0; i < grouped_sharding.group_dims.size(); ++i) { + int64 dim = grouped_sharding.group_dims[i]; + int64 groups = grouped_sharding.group_dim_sizes[i]; + result.set_dimensions(dim, result.dimensions(dim) / groups); + } + return result; +} + +namespace { + +HloInstruction* GetInGroupPartitionId( + HloInstruction* partition_id, + const std::vector>& device_groups, SpmdBuilder* b) { + int64 total_devices = device_groups.size() * device_groups[0].size(); + std::vector in_group_ids(total_devices); + for (uint32 i = 0; i < device_groups.size(); ++i) { + for (uint32 j = 0; j < device_groups[i].size(); ++j) { + in_group_ids[device_groups[i][j]] = j; + } + } + auto id_table = b->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR1(in_group_ids))); + return b->AddInstruction(HloInstruction::CreateReshape( + ShapeUtil::MakeScalarShape(U32), + b->AddInstruction(HloInstruction::CreateDynamicSlice( + ShapeUtil::MakeShape(U32, {1}), id_table, {partition_id}, {1})))); +} + +SPMDCollectiveOpsCreator GetPerGroupCollectiveOpsCreator( + const SPMDCollectiveOpsCreator& creator, + const std::vector>& device_groups) { + SPMDCollectiveOpsCreator result; + result.create_partition_id = [creator, device_groups](SpmdBuilder* b) { + return GetInGroupPartitionId(creator.create_partition_id(b), device_groups, + b); + }; + auto expand_partition_groups = + [device_groups]( + const std::vector>& partition_subgroups) { + if (partition_subgroups.empty()) { + return device_groups; + } + std::vector> result(partition_subgroups.size() * + device_groups.size()); 
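+          // result[g * partition_subgroups.size() + i] holds subgroup i with
+          // each element remapped into the g-th device group.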
+ for (int64 g = 0; g < device_groups.size(); ++g) { + for (int64 i = 0; i < partition_subgroups.size(); ++i) { + result[g * partition_subgroups.size() + i].resize( + partition_subgroups[i].size()); + for (int64 j = 0; j < partition_subgroups[i].size(); ++j) { + result[g * partition_subgroups.size() + i][j] = + device_groups[g][partition_subgroups[i][j]]; + } + } + } + return result; + }; + result.create_cross_partition_all_reduce = + [creator, expand_partition_groups]( + SpmdBuilder* b, HloInstruction* operand, HloComputation* reduction, + const std::vector>& partition_subgroups, + int64 channel_id) { + return creator.create_cross_partition_all_reduce( + b, operand, reduction, expand_partition_groups(partition_subgroups), + channel_id); + }; + result.create_cross_partition_collective_permute = + [creator, device_groups]( + SpmdBuilder* b, HloInstruction* operand, + std::vector>& src_dst_pairs, + int64 next_channel_id) { + std::vector> expanded_pairs( + src_dst_pairs.size() * device_groups.size()); + for (int64 g = 0; g < device_groups.size(); ++g) { + for (int64 i = 0; i < src_dst_pairs.size(); ++i) { + expanded_pairs[g * src_dst_pairs.size() + i] = + std::pair{ + device_groups[g][src_dst_pairs[i].first], + device_groups[g][src_dst_pairs[i].second]}; + } + } + return creator.create_cross_partition_collective_permute( + b, operand, expanded_pairs, next_channel_id); + }; + result.create_cross_partition_all_to_all = + [creator, expand_partition_groups]( + SpmdBuilder* b, absl::Span operands, + const std::vector>& partition_subgroups, + int64 channel_id, absl::optional split_dimension) { + return creator.create_cross_partition_all_to_all( + b, operands, expand_partition_groups(partition_subgroups), + channel_id, split_dimension); + }; + if (creator.create_cross_partition_all_gather) { + result.create_cross_partition_all_gather = + [creator, expand_partition_groups]( + SpmdBuilder* b, HloInstruction* operand, const Shape& ag_shape, + const std::vector>& partition_subgroups, + int64 channel_id, int64 all_gather_dimension) { + return creator.create_cross_partition_all_gather( + b, operand, ag_shape, + expand_partition_groups(partition_subgroups), channel_id, + all_gather_dimension); + }; + } + return result; +} + +} // namespace + +PartitionedHlo::PartitioningState CreatePerGroupPartitioningState( + const PartitionedHlo::PartitioningState& state, + const std::vector>& device_groups, SpmdBuilder* b) { + auto result = state; + result.collective_ops_creator = GetPerGroupCollectiveOpsCreator( + state.collective_ops_creator, device_groups); + result.partition_id = + GetInGroupPartitionId(state.partition_id, device_groups, b); + // Create a string key for the groups. 
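+  // e.g. device_groups {{0,1},{2,3}} produces the cache key "0,1;2,3".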
+ std::vector per_group_strings(device_groups.size()); + for (int64 i = 0; i < per_group_strings.size(); ++i) { + per_group_strings[i] = absl::StrJoin(device_groups[i], ","); + } + auto& grouped_cache = + state.reshard_cache->groupd_caches[absl::StrJoin(per_group_strings, ";")]; + if (!grouped_cache) { + grouped_cache = absl::make_unique(); + } + result.reshard_cache = grouped_cache.get(); + return result; +} + +HloInstruction* PerGroupSliceFromReplicated( + HloInstruction* replicated, HloInstruction* partition_id, + const std::vector>& device_groups, + absl::Span group_dims, absl::Span group_dim_sizes, + SpmdBuilder* b) { + std::vector group_ids(device_groups.size() * device_groups[0].size()); + for (int64 g = 0; g < device_groups.size(); ++g) { + for (int64 device : device_groups[g]) { + group_ids[device] = g; + } + } + auto group_id_table = b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR1(group_ids))); + auto group_id = b->AddInstruction(HloInstruction::CreateReshape( + ShapeUtil::MakeScalarShape(U32), + b->AddInstruction(HloInstruction::CreateDynamicSlice( + ShapeUtil::MakeShape(U32, {1}), group_id_table, {partition_id}, + {1})))); + std::vector group_level_tile_dims(replicated->shape().rank(), 1); + for (int64 i = 0; i < group_dims.size(); ++i) { + group_level_tile_dims[group_dims[i]] = group_dim_sizes[i]; + } + Array group_level_tile(group_level_tile_dims); + group_level_tile.Each([&](absl::Span indices, int64* group) { + *group = 0; + for (int64 dim : group_dims) { + *group *= group_level_tile.dim(dim); + *group += indices[dim]; + } + }); + auto group_level_sharding = HloSharding::Tile(group_level_tile); + auto padded_hlo = PadBaseShapeBeforeUnevenTiledSharding( + replicated, group_level_sharding, b); + auto shard_shape = + MakePartitionedShape(replicated->shape(), group_level_sharding); + return b->AddInstruction(HloInstruction::CreateDynamicSlice( + shard_shape, padded_hlo, + MakePartitionOffsets(replicated->shape(), group_level_sharding, group_id, + b), + shard_shape.dimensions())); +} + +absl::optional TransposeShardingWithCollapsedDims( + const HloSharding& source, absl::Span src_to_tgt, + absl::Span tgt_to_src) { + if (source.IsTileMaximal()) { + return source; + } + std::vector tgt_dims_skipping_new(tgt_to_src.size(), -1); + int64 skipped_tgt_dims = 0; + for (int64 i = 0; i < tgt_to_src.size(); ++i) { + if (tgt_to_src[i] < 0) { + skipped_tgt_dims++; + } else { + tgt_dims_skipping_new[i] = i - skipped_tgt_dims; + } + } + int64 skipped_src_dims = absl::c_count(src_to_tgt, -1); + std::vector perm(src_to_tgt.size()); + for (int64 i = 0; i < src_to_tgt.size(); ++i) { + if (src_to_tgt[i] < 0) { + if (source.tile_assignment().dim(i) > 1) { + return absl::nullopt; + } + perm[src_to_tgt.size() - skipped_src_dims] = i; + skipped_src_dims--; + } else { + perm[tgt_dims_skipping_new[src_to_tgt[i]]] = i; + } + } + auto tgt_sharding = hlo_sharding_util::TransposeSharding(source, perm); + if (skipped_tgt_dims == 0) { + return tgt_sharding; + } + auto reshape_tiles = tgt_sharding.tile_assignment(); + std::vector tgt_tiles(tgt_to_src.size(), 1); + for (int64 i = 0; i < tgt_tiles.size(); ++i) { + if (tgt_to_src[i] >= 0) { + tgt_tiles[i] = reshape_tiles.dim(tgt_dims_skipping_new[i]); + } + } + reshape_tiles.Reshape(tgt_tiles); + return HloSharding::Tile(reshape_tiles); +} + +absl::optional ParseReductionComputation( + const HloComputation* reduction_comp) { + if (reduction_comp->num_parameters() != 2) { + return absl::nullopt; + } + auto root = 
reduction_comp->root_instruction(); + if (!root->IsElementwiseBinary()) { + return absl::nullopt; + } + if (!absl::c_linear_search(root->operands(), + reduction_comp->parameter_instruction(0)) || + !absl::c_linear_search(root->operands(), + reduction_comp->parameter_instruction(1))) { + return absl::nullopt; + } + return root->opcode(); +} + } // namespace spmd } // namespace xla diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h index 5f245667970..6906b52ca79 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h @@ -33,9 +33,16 @@ namespace spmd { // Returns true if the given sharding contains any replicated sharding. bool HasReplicatedSharding(const HloSharding& sharding); +// Creates constant value instructions of the given shape. The literal must be a +// scalar shape and is broadcast to the given shape. +HloInstruction* CreateConstant(const Shape& shape, Literal value, + SpmdBuilder* b); // Creates zero value instructions of the given shape. HloInstruction* CreateZero(const Shape& shape, SpmdBuilder* b); +// Creates one value instructions of the given shape. +HloInstruction* CreateOne(const Shape& shape, SpmdBuilder* b); + template HloInstruction* CreateR0WithType(PrimitiveType type, NativeT value, SpmdBuilder* b) { @@ -87,10 +94,12 @@ Shape MakeNonPaddedShapeForGivenPartition(const Shape& shape, // Generates the HLO instructions that represent the dimension offsets on any // device. The size of the returned vector is the rank of the given shape. -std::vector MakePartitionOffsets(const Shape& shape, - const HloSharding& sharding, - HloInstruction* partition_id, - SpmdBuilder* b); +// If `dims` is non-empty, the generated offsets will only be non-zero for those +// dimensions. +std::vector MakePartitionOffsets( + const Shape& shape, const HloSharding& sharding, + HloInstruction* partition_id, SpmdBuilder* b, + absl::Span dims = {}); // Returns the offsets of the partition in the tile assignment. std::vector MakeTiledPartitionOrdinals( @@ -262,6 +271,106 @@ absl::optional GetKValueInTopKWhenPartitionSortDim(HloInstruction* hlo); HloInstruction* SliceFirstK(HloInstruction* hlo, SpmdBuilder* builder, int64 slice_dim, int64 k); +// Check if a dimension is sharded. +int64 ShardCountAtDim(const HloSharding& sharding, int64 dim); + +// Returns the list of source-target pairs of dimensions to swap during +// resharding via all-to-all. Reshard can be done by swapping each pair at a +// time. +absl::optional>> +GetReshardAllToAllSourceTargetDims(const HloSharding& source, + const HloSharding& target); + +// Returns whether the resharding can be done via collective-permute. +bool CanReshardWithCollectivePermute(const HloSharding& source, + const HloSharding& target); + +// Represents grouping devices in a tiled sharding along certain dimensions. +// Elements in group dimensions define different device groups, and the sharding +// represents the in-group sharding. 
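+// A minimal usage sketch (the 2x2 iota device assignment below is an assumed
+// example for illustration): grouping a [2,2] tile assignment along dimension
+// 0 produces device groups {0,1} and {2,3}, each with an in-group [1,2]
+// tiling, and UngroupSharding recovers the original sharding.
+//
+//   Array<int64> tiles({2, 2});
+//   tiles.FillIota(0);  // [[0, 1], [2, 3]]
+//   GroupedSharding grouped =
+//       GroupShardingOnDims(HloSharding::Tile(tiles), /*group_dims=*/{0});
+//   // grouped.device_groups == {{0, 1}, {2, 3}}
+//   // grouped.sharding == {devices=[1,2]0,1}
+//   HloSharding restored = UngroupSharding(grouped);  // == HloSharding::Tile(tiles)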
+struct GroupedSharding { + GroupedSharding(std::vector> device_groups, + std::vector group_dims, + std::vector group_dim_sizes, int64 rank, + HloSharding grouped_sharding) + : device_groups(std::move(device_groups)), + group_dims(std::move(group_dims)), + group_dim_sizes(std::move(group_dim_sizes)), + sharding(std::move(grouped_sharding)) {} + std::vector> device_groups; + std::vector group_dims; + std::vector group_dim_sizes; + int64 rank; + HloSharding sharding; +}; + +// Creates a GroupedSharding for a tiled sharding. +GroupedSharding GroupShardingOnDims(const HloSharding& sharding, + absl::Span group_dims); + +// Reconstructs the ungrouped sharding from a GroupedSharding. +HloSharding UngroupSharding(const GroupedSharding& grouped_sharding); + +// Returns a new GroupedSharding that has the same group definition as +// `reference`. +GroupedSharding AlignGroupsWith(GroupedSharding grouped_sharding, + const GroupedSharding& reference, + bool ignore_group_order = false); + +// Returns the per-group base shape, i.e., before applying the in-group +// sharding. +Shape GetPerGroupBaseShape(const GroupedSharding& grouped_sharding, + const Shape& original_base_shape); + +// Creates the nested partitioner state for in-group partitioning. +PartitionedHlo::PartitioningState CreatePerGroupPartitioningState( + const PartitionedHlo::PartitioningState& state, + const std::vector>& device_groups, SpmdBuilder* b); + +// Partially shards a replicated HLO into groups along the group dimensions, and +// within each group data is still replicated. +HloInstruction* PerGroupSliceFromReplicated( + HloInstruction* replicated, HloInstruction* partition_id, + const std::vector>& device_groups, + absl::Span group_dims, absl::Span group_dim_sizes, + SpmdBuilder* b); + +// Similar to hlo_sharding_util::TransposeSharding(), but allows removing/adding +// non-partitioned dimensions. In src_to_tgt and tgt_to_src, -1 represents a +// non-existing dimension. +absl::optional TransposeShardingWithCollapsedDims( + const HloSharding& source, absl::Span src_to_tgt, + absl::Span tgt_to_src); + +// Returns the opcode if `reduction_comp` represents a simple binary elementwise +// computation on the two operands. +absl::optional ParseReductionComputation( + const HloComputation* reduction_comp); + +// Pad the shape from partial replicate shape for `dst_sharding`. +// If dst_sharding needs more padding and per_shard_size increased in +// dst_sharding, halo exchange on the right side is needed. +absl::optional PadFromPartialReplicateShape( + HloInstruction* hlo, const Shape& base_shape, + const HloSharding& src_sharding, const HloSharding& dst_sharding, + const std::vector& expand_tile_dims, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, HloInstruction* partition_id, SpmdBuilder* b); + +// Get the compatible sharding from a partial replicate sharding to the given +// target tile dimensions. +// Compatible means the replicated sharding can be transformed to the target +// tile dimensions by dynamic slice. +// For example, if partial_sharding is +// {devices=[1,2,2]0,1,2,3 last_tile_dim_replicate} +// and the target tile dims are {2, 2}, the returned compatible sharding will be +// sharding={devices=[1,2,2]0,2,1,3 last_tile_dim_replicate}. +// If partial_sharding is not partially replicated or can't be resharded to +// target_tile_dims by dynamic slice, return absl::nullopt.
+absl::optional PartialReplicateToTileCompatibleSharding( + const HloSharding& partial_sharding, + const std::vector& target_tile_dims); + } // namespace spmd } // namespace xla diff --git a/tensorflow/compiler/xla/service/topk_rewriter.cc b/tensorflow/compiler/xla/service/topk_rewriter.cc new file mode 100644 index 00000000000..000b1e94ece --- /dev/null +++ b/tensorflow/compiler/xla/service/topk_rewriter.cc @@ -0,0 +1,194 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/topk_rewriter.h" + +#include "absl/algorithm/container.h" +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/pattern_matcher.h" +#include "tensorflow/compiler/xla/shape_util.h" + +namespace xla { + +static bool IsNanSafeGt(HloComputation* comp) { + namespace m = match; + auto match_bitcast_f32 = [](int64 parameter_number) { + auto param = m::Parameter(parameter_number) + .WithShape(m::Shape().WithElementType(F32)); + auto param_s32 = + m::BitcastConvert(param).WithShape(m::Shape().WithElementType(S32)); + auto param_u32 = + m::BitcastConvert(param).WithShape(m::Shape().WithElementType(U32)); + return m::Select( + m::Lt(param_s32, m::ConstantScalar(0)), + m::BitcastConvert( + m::Subtract(m::ConstantScalar(std::numeric_limits::max()), + param_u32)) + .WithShape(m::Shape().WithElementType(S32)), + param_s32); + }; + auto match_bitcast_bf16 = [](int64 parameter_number) { + auto param = m::Convert(m::Parameter(parameter_number) + .WithShape(m::Shape().WithElementType(BF16))) + .WithShape(m::Shape().WithElementType(F32)); + auto param_s32 = + m::BitcastConvert(param).WithShape(m::Shape().WithElementType(S32)); + auto param_u32 = + m::BitcastConvert(param).WithShape(m::Shape().WithElementType(U32)); + return m::Select( + m::Lt(param_s32, m::ConstantScalar(0)), + m::BitcastConvert( + m::Subtract(m::ConstantScalar(std::numeric_limits::max()), + param_u32)) + .WithShape(m::Shape().WithElementType(S32)), + param_s32); + }; + return Match(comp->root_instruction(), + m::Gt(match_bitcast_f32(0), match_bitcast_f32(1))) || + Match(comp->root_instruction(), + m::Gt(match_bitcast_bf16(0), match_bitcast_bf16(1))); +} + +StatusOr TopkRewriter::Run(HloModule* module) { + bool changed = false; + for (HloComputation* comp : module->computations()) { + for (HloInstruction* inst : comp->MakeInstructionPostOrder()) { + HloSortInstruction* sort = DynCast(inst); + if (sort == nullptr || sort->operand_count() != 2) { + continue; + } + HloInstruction* data = sort->mutable_operand(0); + HloIotaInstruction* iota = + DynCast(sort->mutable_operand(1)); + const PrimitiveType element_type = data->shape().element_type(); + if ((data->shape().rank() != 1 && data->shape().rank() != 2) || + (element_type != F32 && element_type != BF16)) { + 
continue; + } + if (iota == nullptr || iota->shape().rank() != data->shape().rank() || + iota->shape().element_type() != S32 || + iota->opcode() != HloOpcode::kIota || + iota->iota_dimension() != sort->sort_dimension()) { + continue; + } + if (!IsNanSafeGt(sort->to_apply())) { + continue; + } + const int64 sort_dim = sort->sort_dimension(); + const int64 batch_dim = sort_dim == 1 ? 0 : 1; + const bool has_batch = data->shape().rank() == 2; + + bool supported = true; + absl::optional k; + for (HloInstruction* gte : sort->users()) { + if (gte->opcode() != HloOpcode::kGetTupleElement || + gte->user_count() != 1) { + supported = false; + break; + } + const HloInstruction* slice = gte->users()[0]; + if (slice->opcode() != HloOpcode::kSlice) { + // Non-slice user means we are not doing a TopK + supported = false; + break; + } + if (absl::c_any_of(slice->slice_starts(), + [](int x) { return x != 0; }) || + absl::c_any_of(slice->slice_strides(), + [](int x) { return x != 1; })) { + // Strided slice or slicing at the beginning isn't supported. + supported = false; + break; + } + if (has_batch && slice->slice_limits(batch_dim) != + slice->operand(0)->shape().dimensions(batch_dim)) { + // Slicing along the batch dimension isn't supported. + supported = false; + break; + } + if (k == absl::nullopt) { + k = slice->slice_limits(sort_dim); + } else if (k != slice->slice_limits(sort_dim)) { + // Different k for the different operands isn't supported. + supported = false; + break; + } + } + if (k == absl::nullopt || !supported) { + continue; + } + + // Profitability check. + if (!is_profitable_to_convert_(sort, *k)) { + continue; + } + + const int64 batch_size = + has_batch ? sort->operand(0)->shape().dimensions(batch_dim) : 1; + const int64 input_size = sort->operand(0)->shape().dimensions(sort_dim); + HloInstruction* input = sort->mutable_operand(0); + if (has_batch && sort_dim == 0) { + input = comp->AddInstruction(HloInstruction::CreateTranspose( + ShapeUtil::MakeShape(element_type, {batch_size, input_size}), input, + {1, 0})); + } + + Shape topk_shape = + has_batch ? 
ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeShape(element_type, + {batch_size, k.value()}), + ShapeUtil::MakeShape(S32, {batch_size, k.value()})}) + : ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeShape(element_type, {k.value()}), + ShapeUtil::MakeShape(S32, {k.value()})}); + HloInstruction* topk = comp->AddInstruction( + HloInstruction::CreateCustomCall(topk_shape, {input}, "TopK")); + HloInstruction* value_gte = + comp->AddInstruction(HloInstruction::CreateGetTupleElement( + topk->shape().tuple_shapes(0), topk, 0)); + HloInstruction* index_gte = + comp->AddInstruction(HloInstruction::CreateGetTupleElement( + topk->shape().tuple_shapes(1), topk, 1)); + + if (has_batch && sort_dim == 0) { + value_gte = comp->AddInstruction(HloInstruction::CreateTranspose( + ShapeUtil::MakeShape(element_type, {k.value(), batch_size}), + value_gte, {1, 0})); + index_gte = comp->AddInstruction(HloInstruction::CreateTranspose( + ShapeUtil::MakeShape(S32, {k.value(), batch_size}), index_gte, + {1, 0})); + } + + for (HloInstruction* gte : sort->users()) { + for (HloInstruction* slice : gte->users()) { + if (gte->tuple_index() == 0) { + TF_RETURN_IF_ERROR(slice->ReplaceAllUsesWith(value_gte)); + } else if (gte->tuple_index() == 1) { + TF_RETURN_IF_ERROR(slice->ReplaceAllUsesWith(index_gte)); + } else { + LOG(FATAL) << "Sort with more than 2 output isn't supported in " + "topk rewriter"; + } + } + } + changed = true; + } + } + return changed; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/topk_rewriter.h b/tensorflow/compiler/xla/service/topk_rewriter.h new file mode 100644 index 00000000000..68f8a8145e2 --- /dev/null +++ b/tensorflow/compiler/xla/service/topk_rewriter.h @@ -0,0 +1,44 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_TOPK_REWRITER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_TOPK_REWRITER_H_ + +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" + +namespace xla { +// This pass pattern-matches soups of HLOs executing a TopK operation and +// replaces them with a TopK CustomCall when the given values are supported by +// the CustomCall and it is more efficient to use that implementation. +class TopkRewriter : public HloModulePass { + public: + explicit TopkRewriter(std::function + is_profitable_to_convert) + : is_profitable_to_convert_(std::move(is_profitable_to_convert)) {} + + absl::string_view name() const override { return "topk-rewriter"; } + + StatusOr Run(HloModule* module) override; + + private: + // Predicate that returns true if a sort instruction is profitable to be + // converted into a custom call. 
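+  // For example, a sketch of constructing the pass (the k threshold below is
+  // an assumed choice; the unit tests use a predicate that always returns
+  // true):
+  //
+  //   TopkRewriter rewriter(
+  //       [](const HloSortInstruction* /*sort*/, int64 k) { return k <= 1024; });
+  //   TF_ASSIGN_OR_RETURN(bool changed, rewriter.Run(module));
+  //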
+ std::function + is_profitable_to_convert_; +}; +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_TOPK_REWRITER_H_ diff --git a/tensorflow/compiler/xla/service/topk_rewriter_test.cc b/tensorflow/compiler/xla/service/topk_rewriter_test.cc new file mode 100644 index 00000000000..ec5b34b1c0a --- /dev/null +++ b/tensorflow/compiler/xla/service/topk_rewriter_test.cc @@ -0,0 +1,160 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/topk_rewriter.h" + +#include "tensorflow/compiler/xla/service/hlo_dce.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/tests/test_macros.h" +#include "tensorflow/compiler/xla/tests/test_utils.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" + +namespace op = xla::testing::opcode_matchers; + +namespace xla { +namespace { + +using TopkRewriterTest = HloTestBase; + +std::string getComparator() { + return R"( +%compare { + %p.1.lhs.8 = s32[] parameter(2) + %p.1.rhs.9 = s32[] parameter(3) + %p.0.lhs.6 = f32[] parameter(0) + %bitcast-convert.11 = s32[] bitcast-convert(%p.0.lhs.6) + %constant.15 = s32[] constant(0) + %compare.16 = pred[] compare(%bitcast-convert.11, %constant.15), direction=LT + %constant.10 = u32[] constant(2147483647) + %bitcast-convert.12 = u32[] bitcast-convert(%p.0.lhs.6) + %subtract.13 = u32[] subtract(%constant.10, %bitcast-convert.12) + %bitcast-convert.14 = s32[] bitcast-convert(%subtract.13) + %select.17 = s32[] select(%compare.16, %bitcast-convert.14, + %bitcast-convert.11) + %p.0.rhs.7 = f32[] parameter(1) + %bitcast-convert.19 = s32[] bitcast-convert(%p.0.rhs.7) + %constant.23 = s32[] constant(0) + %compare.24 = pred[] compare(%bitcast-convert.19, %constant.23), direction=LT + %constant.18 = u32[] constant(2147483647) + %bitcast-convert.20 = u32[] bitcast-convert(%p.0.rhs.7) + %subtract.21 = u32[] subtract(%constant.18, %bitcast-convert.20) + %bitcast-convert.22 = s32[] bitcast-convert(%subtract.21) + %select.25 = s32[] select(%compare.24, %bitcast-convert.22, + %bitcast-convert.19) + ROOT %compare.26 = pred[] compare(%select.17, %select.25), direction=GT +})"; +} + +TEST_F(TopkRewriterTest, Rewrite) { + const std::string hlo_string = R"( +HloModule module +)" + getComparator() + R"( +ENTRY cluster { + %arg_tuple.1 = f32[8,1234567] parameter(0) + %iota.4 = s32[8,1234567] iota(), iota_dimension=1 + %sort.27 = (f32[8,1234567], s32[8,1234567]) sort(%arg_tuple.1, %iota.4), + dimensions={1}, is_stable=true, to_apply=%compare + %get-tuple-element.28 = f32[8,1234567] get-tuple-element(%sort.27), index=0 + %slice.29 = f32[8,5] slice(%get-tuple-element.28), slice={[0:8], [0:5]} + %get-tuple-element.30 = s32[8,1234567] get-tuple-element(%sort.27), index=1 + 
%slice.31 = s32[8,5] slice(%get-tuple-element.30), slice={[0:8], [0:5]} + ROOT %tuple.32 = (f32[8,5], s32[8,5]) tuple(%slice.29, %slice.31) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TopkRewriter rewriter([](const HloSortInstruction*, int64) { return true; }); + TF_ASSERT_OK_AND_ASSIGN(bool changed, rewriter.Run(module.get())); + TF_ASSERT_OK(HloDCE().Run(module.get()).status()); + EXPECT_TRUE(changed); + EXPECT_THAT( + module->entry_computation()->root_instruction(), + op::Tuple(op::GetTupleElement(op::CustomCall(op::Parameter(0)), 0), + op::GetTupleElement(op::CustomCall(op::Parameter(0)), 1))); + const HloInstruction* cc = + module->entry_computation()->root_instruction()->operand(0)->operand(0); + EXPECT_THAT(cc->custom_call_target(), "TopK"); +} + +TEST_F(TopkRewriterTest, RewriteUnbatched) { + const std::string hlo_string = R"( +HloModule module +)" + getComparator() + R"( +ENTRY cluster { + %arg_tuple.1 = f32[1234567] parameter(0) + %iota.4 = s32[1234567] iota(), iota_dimension=0 + %sort.27 = (f32[1234567], s32[1234567]) sort(%arg_tuple.1, %iota.4), + dimensions={0}, is_stable=true, to_apply=%compare + %get-tuple-element.28 = f32[1234567] get-tuple-element(%sort.27), index=0 + %slice.29 = f32[5] slice(%get-tuple-element.28), slice={[0:5]} + %get-tuple-element.30 = s32[1234567] get-tuple-element(%sort.27), index=1 + %slice.31 = s32[5] slice(%get-tuple-element.30), slice={[0:5]} + ROOT %tuple.32 = (f32[5], s32[5]) tuple(%slice.29, %slice.31) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TopkRewriter rewriter([](const HloSortInstruction*, int64) { return true; }); + TF_ASSERT_OK_AND_ASSIGN(bool changed, rewriter.Run(module.get())); + TF_ASSERT_OK(HloDCE().Run(module.get()).status()); + EXPECT_TRUE(changed); + EXPECT_THAT( + module->entry_computation()->root_instruction(), + op::Tuple(op::GetTupleElement(op::CustomCall(op::Parameter(0)), 0), + op::GetTupleElement(op::CustomCall(op::Parameter(0)), 1))); + const HloInstruction* cc = + module->entry_computation()->root_instruction()->operand(0)->operand(0); + EXPECT_THAT(cc->custom_call_target(), "TopK"); +} + +TEST_F(TopkRewriterTest, RewriteTranspose) { + const std::string hlo_string = R"( +HloModule module +)" + getComparator() + R"( +ENTRY cluster { + %arg_tuple.1 = f32[1234567,8] parameter(0) + %iota.4 = s32[1234567,8] iota(), iota_dimension=0 + %sort.27 = (f32[1234567,8], s32[1234567,8]) sort(%arg_tuple.1, %iota.4), + dimensions={0}, is_stable=true, to_apply=%compare + %get-tuple-element.28 = f32[1234567,8] get-tuple-element(%sort.27), index=0 + %slice.29 = f32[5,8] slice(%get-tuple-element.28), slice={[0:5], [0:8]} + %get-tuple-element.30 = s32[1234567,8] get-tuple-element(%sort.27), index=1 + %slice.31 = s32[5,8] slice(%get-tuple-element.30), slice={[0:5], [0:8]} + ROOT %tuple.32 = (f32[5,8], s32[5,8]) tuple(%slice.29, %slice.31) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TopkRewriter rewriter([](const HloSortInstruction*, int64) { return true; }); + TF_ASSERT_OK_AND_ASSIGN(bool changed, rewriter.Run(module.get())); + TF_ASSERT_OK(HloDCE().Run(module.get()).status()); + EXPECT_TRUE(changed); + LOG(INFO) << module->entry_computation()->ToString(); + EXPECT_THAT( + module->entry_computation()->root_instruction(), + op::Tuple(op::Transpose(op::GetTupleElement( + op::CustomCall(op::Transpose(op::Parameter(0))), 0)), + op::Transpose(op::GetTupleElement( + 
op::CustomCall(op::Transpose(op::Parameter(0))), 1)))); + const HloInstruction* cc = module->entry_computation() + ->root_instruction() + ->operand(0) + ->operand(0) + ->operand(0); + EXPECT_THAT(cc->custom_call_target(), "TopK"); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/triangular_solve_expander.cc b/tensorflow/compiler/xla/service/triangular_solve_expander.cc index cc483c310e8..d54eb9e78c3 100644 --- a/tensorflow/compiler/xla/service/triangular_solve_expander.cc +++ b/tensorflow/compiler/xla/service/triangular_solve_expander.cc @@ -454,6 +454,9 @@ XlaOp BuildTriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower, } // namespace +TriangularSolveExpander::TriangularSolveExpander(int64 block_size) + : block_size_(block_size) {} + bool TriangularSolveExpander::InstructionMatchesPattern( HloInstruction* instruction) { return instruction->opcode() == HloOpcode::kTriangularSolve; @@ -496,7 +499,7 @@ StatusOr TriangularSolveExpander::ExpandInstruction( BuildTriangularSolve(a, b, options.left_side(), options.lower(), transpose_a, conjugate_a, options.unit_diagonal(), - /*block_size=*/128, + /*block_size=*/block_size_, /*precision=*/PrecisionConfig::HIGHEST); TF_ASSIGN_OR_RETURN(XlaComputation xla_computation, builder.Build()); diff --git a/tensorflow/compiler/xla/service/triangular_solve_expander.h b/tensorflow/compiler/xla/service/triangular_solve_expander.h index be2374ef8c8..362e8557229 100644 --- a/tensorflow/compiler/xla/service/triangular_solve_expander.h +++ b/tensorflow/compiler/xla/service/triangular_solve_expander.h @@ -23,6 +23,8 @@ namespace xla { class TriangularSolveExpander : public OpExpanderPass { public: + explicit TriangularSolveExpander(int64 block_size = 128); + absl::string_view name() const override { return "triangular_solve_expander"; } @@ -34,6 +36,8 @@ class TriangularSolveExpander : public OpExpanderPass { HloInstruction* instruction) override; private: + // Block size for BuildTriangularSolve + const int64 block_size_; // Mapping from op signatures to existing computations. absl::flat_hash_map computation_cache_; }; diff --git a/tensorflow/compiler/xla/service/triangular_solve_expander_test.cc b/tensorflow/compiler/xla/service/triangular_solve_expander_test.cc new file mode 100644 index 00000000000..84663f80e7a --- /dev/null +++ b/tensorflow/compiler/xla/service/triangular_solve_expander_test.cc @@ -0,0 +1,108 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/triangular_solve_expander.h" + +#include +#include + +#include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/reference_util.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/core/lib/core/status_test_util.h" + +namespace xla { +namespace { + +class TriangularExpanderTest : public HloTestBase, + public ::testing::WithParamInterface {}; + +TEST_P(TriangularExpanderTest, TestBlockSize) { + auto block_size = GetParam(); + std::string hlo_string = R"( + HloModule TensorFlowTriangularSolve + + ENTRY main { + a = f32[256,256]{1,0} parameter(0) + b = f32[256,192]{1,0} parameter(1) + ROOT triangular-solve = f32[256,192]{1,0} triangular-solve(a, b), + left_side=true, unit_diagonal=true, + lower=true, transpose_a=NO_TRANSPOSE + } + )"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + + { + TriangularSolveExpander triangular_solve_expander(block_size); + + TF_ASSERT_OK_AND_ASSIGN( + bool result, RunHloPass(&triangular_solve_expander, module.get())); + EXPECT_TRUE(result); + } + + // To test triangular solver expander we generate simple bidiagonal matrix: + // Solve a * x = b. + // Check that shape is still valid. + // Use reference matrix multiplication to test validity of result. + + Array2D a(256, 256); + for (int64 row = 0; row < a.dim(0); ++row) { + a(row, row) = 1; + if (row > 0) { + a(row, row - 1) = 0.01; + } + } + + Array2D b(256, 192); + const float kMax = static_cast(b.dim(0) * b.dim(1) + 1); + for (int64 row = 0; row < b.dim(0); ++row) { + for (int64 col = 0; col < b.dim(1); ++col) { + b(row, col) = static_cast(row + col + 1) / kMax; + } + } + auto la = LiteralUtil::CreateR2FromArray2D(a); + auto lb = LiteralUtil::CreateR2FromArray2D(b); + + TF_ASSERT_OK_AND_ASSIGN(Literal lx, Execute(std::move(module), {&la, &lb})); + + auto x_shape = lx.shape(); + EXPECT_EQ(x_shape.dimensions_size(), 2); + EXPECT_EQ(x_shape.dimensions(0), b.dim(0)); + EXPECT_EQ(x_shape.dimensions(1), b.dim(1)); + + Array2D x(x_shape.dimensions(0), x_shape.dimensions(1)); + x.SetValues(lx.data()); + + auto ref_b = ReferenceUtil::MatmulArray2D(a, x); + auto ref_lb = LiteralUtil::CreateR2FromArray2D(*ref_b); + + EXPECT_TRUE( + LiteralTestUtil::NearOrEqual(ref_lb, lb, ErrorSpec{0.001, 0.001})); +} + +// block_size test limits based on the following considerations: +// - test at least twice the range of original value +// - try to test odd values unaligned with matrix dims +// - full 1-256 range test takes too long to run + +INSTANTIATE_TEST_CASE_P(TriangularExpanderTestInstances, TriangularExpanderTest, + ::testing::Range(2, 256, 7)); + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/shape.h b/tensorflow/compiler/xla/shape.h index dfaac677724..6a19a1fac09 100644 --- a/tensorflow/compiler/xla/shape.h +++ b/tensorflow/compiler/xla/shape.h @@ -49,7 +49,7 @@ class Shape { // Returns the rank (number of dimensions) of the given shape. Shape must be // an array. 
int64 rank() const { - CHECK(IsArray()) << "Non-arrays do not have a rank, shape: " << ToString(); + DCHECK(IsArray()) << "Non-arrays do not have a rank, shape: " << ToString(); return dimensions_.size(); } diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc index bce40578132..0833919b124 100644 --- a/tensorflow/compiler/xla/shape_util.cc +++ b/tensorflow/compiler/xla/shape_util.cc @@ -339,6 +339,15 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( TF_DCHECK_OK(ValidateShape(*shape)); } +/* static */ void ShapeUtil::CopyDynamicDimensions(Shape* to, + const Shape& from) { + CHECK_EQ(to->rank(), from.rank()); + for (int64 i = 0; i < from.rank(); ++i) { + to->set_dynamic_dimension(i, from.is_dynamic_dimension(i)); + } + TF_DCHECK_OK(ValidateShape(*to)); +} + /* static */ bool ShapeUtil::ElementIsIntegral(const Shape& shape) { return primitive_util::IsIntegralType(shape.element_type()); } @@ -522,13 +531,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( text += ")"; return text; } - string result = StrCat( - primitive_util::LowercasePrimitiveTypeName(shape.element_type()), "["); - for (int i = 0; i < shape.dimensions().size(); i++) { - StrAppend(&result, (i > 0) ? "," : "", - shape.is_dynamic_dimension(i) ? "<=" : "", shape.dimensions(i)); - } - result += "]"; + string result = HumanString(shape); if (IsScalar(shape)) { string layout_str = LayoutUtil::HumanString(shape.layout()); // Don't print "{}" as layout for scalars. @@ -780,9 +783,18 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( /* static */ Shape ShapeUtil::ChangeElementType(const Shape& original, PrimitiveType type) { - Shape new_shape = original; - new_shape.set_element_type(type); - return new_shape; + if (original.IsTuple()) { + std::vector new_operands; + new_operands.reserve(original.tuple_shapes_size()); + for (const Shape& operand : original.tuple_shapes()) { + new_operands.push_back(ChangeElementType(operand, type)); + } + return MakeTupleShape(new_operands); + } else { + Shape new_shape = original; + new_shape.set_element_type(type); + return new_shape; + } } /* static */ bool ShapeUtil::IndexIsValid(const Shape& shape, diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h index fe1a8acf6e4..3f69a8b0aca 100644 --- a/tensorflow/compiler/xla/shape_util.h +++ b/tensorflow/compiler/xla/shape_util.h @@ -269,6 +269,14 @@ class ShapeUtil { if (SameElementType(a, b)) { return a.element_type(); } + // If only one of A and B are floating use the floating point type. + if (ElementIsFloating(a) && !ElementIsFloating(b)) { + return a.element_type(); + } + if (ElementIsFloating(b) && !ElementIsFloating(a)) { + return b.element_type(); + } + // Use the higher precision type. return primitive_util::BitWidth(a.element_type()) < primitive_util::BitWidth(b.element_type()) ? b.element_type() @@ -377,6 +385,9 @@ class ShapeUtil { // Appends a major dimension to the shape with the given bound. static void AppendMajorDimension(int bound, Shape* shape); + // Copy the dynamic dimensions property from one shape to another. + static void CopyDynamicDimensions(Shape* to, const Shape& from); + // Returns an empty tuple shape. Can be used as a sentinel Shape value. 
static Shape MakeNil() { return MakeTupleShape({}); } diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD index 83851fabd53..3dac381ae7d 100644 --- a/tensorflow/compiler/xla/tests/BUILD +++ b/tensorflow/compiler/xla/tests/BUILD @@ -728,6 +728,7 @@ xla_test( name = "array_elementwise_ops_test", srcs = ["array_elementwise_ops_test.cc"], shard_count = 25, + tags = ["no_oss"], # b/163416869 deps = [ ":test_macros_header", "//tensorflow/compiler/xla:array2d", @@ -1115,10 +1116,53 @@ xla_test( name = "convolution_test", timeout = "long", srcs = ["convolution_test.cc"], - shard_count = 40, + shard_count = 50, tags = [ "no_rocm", + "optonly", + # Timed out on 2020-07-18 "nozapfhahn", + ], + deps = CONVOLUTION_TEST_DEPS + [ + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + ], +) + +xla_test( + name = "convolution_test_1d", + timeout = "long", + srcs = ["convolution_test_1d.cc"], + # Turn on logging so that VLOG statements don't appear uncovered to zapfhahn. + args = ["--vmodule=convolution_emitter=7"], + # In the open source build, convolution_test_1d_gpu fails because it doesn't + # recognize --vmodule. + disabled_backends = [ + "cpu", + "gpu", + ], + shard_count = 50, + tags = [ + "no_rocm", + "optonly", + ], + deps = CONVOLUTION_TEST_DEPS + [ + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + ], +) + +xla_test( + name = "convolution_test_1d_no_vmodule", + timeout = "long", + srcs = ["convolution_test_1d.cc"], + backends = [ + "cpu", + "gpu", + ], + shard_count = 50, + tags = [ + "no_rocm", "optonly", ], deps = CONVOLUTION_TEST_DEPS + [ @@ -1147,6 +1191,23 @@ xla_test( ], ) +xla_test( + name = "convolution_test_1d_autotune_disabled", + timeout = "long", + srcs = ["convolution_test_1d.cc"], + args = ["--xla_gpu_autotune_level=0"], + backends = ["gpu"], + shard_count = 40, + tags = [ + "no_rocm", + "optonly", + ], + deps = CONVOLUTION_TEST_DEPS + [ + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + ], +) + xla_test( name = "convolution_test_gpu_alternative_layout", timeout = "long", @@ -1163,6 +1224,22 @@ xla_test( ], ) +xla_test( + name = "convolution_test_1d_gpu_alternative_layout", + timeout = "long", + srcs = ["convolution_test_1d.cc"], + backend_args = {"gpu": ["--xla_backend_extra_options=xla_gpu_experimental_conv_disable_layout_heuristic"]}, + backends = ["gpu"], + shard_count = 25, + tags = [ + "no_rocm", + ], + deps = CONVOLUTION_TEST_DEPS + [ + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + ], +) + xla_test( name = "convolution_variants_test", timeout = "long", @@ -2012,6 +2089,31 @@ xla_test( ], ) +xla_test( + name = "dynamism_inference_test", + srcs = ["dynamism_inference_test.cc"], + deps = [ + ":test_macros_header", + "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/compiler/xla/client:client_library", + "//tensorflow/compiler/xla/client:global_data", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", + "//tensorflow/compiler/xla/client/lib:prng", + "//tensorflow/compiler/xla/tests:literal_test_util", + "//tensorflow/compiler/xla/tests:test_utils", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "@com_google_absl//absl/strings", 
+ ], +) + xla_test( name = "compute_constant_test", srcs = ["compute_constant_test.cc"], diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc index a956b85a940..fdc679a61c6 100644 --- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc +++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc @@ -1203,6 +1203,16 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareEqF32s) { ComputeAndCompareR1(&builder, {false, false, true, false, false}, {}); } +XLA_TEST_F(ArrayElementwiseOpTest, CompareEqF32sTO) { + SetFastMathDisabled(true); + XlaBuilder builder(TestName()); + auto lhs = ConstantR1(&builder, {-2.5f, 25.5f, 2.25f, NAN, 6.0f}); + auto rhs = ConstantR1(&builder, {10.0f, 5.0f, 2.25f, NAN, NAN}); + EqTotalOrder(lhs, rhs); + + ComputeAndCompareR1(&builder, {false, false, true, true, false}, {}); +} + XLA_TEST_F(ArrayElementwiseOpTest, CompareEqZeroElementF32s) { XlaBuilder builder(TestName()); auto lhs = ConstantR1(&builder, {}); @@ -1222,6 +1232,18 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareGeF32s) { ComputeAndCompareR1(&builder, {false, true, true, false, false}, {}); } +XLA_TEST_F(ArrayElementwiseOpTest, CompareGeF32sTO) { + SetFastMathDisabled(true); + XlaBuilder builder(TestName()); + auto lhs = + ConstantR1(&builder, {-2.5f, 25.5f, 2.25f, NAN, 6.0f, 6.0f}); + auto rhs = ConstantR1(&builder, {10.0f, 5.0f, 1.0f, 10.0f, NAN, -NAN}); + GeTotalOrder(lhs, rhs); + + ComputeAndCompareR1(&builder, {false, true, true, true, false, true}, + {}); +} + XLA_TEST_F(ArrayElementwiseOpTest, CompareGtF32s) { SetFastMathDisabled(true); XlaBuilder builder(TestName()); diff --git a/tensorflow/compiler/xla/tests/buffer_donation_test.cc b/tensorflow/compiler/xla/tests/buffer_donation_test.cc index 5f936870103..f78083fe2af 100644 --- a/tensorflow/compiler/xla/tests/buffer_donation_test.cc +++ b/tensorflow/compiler/xla/tests/buffer_donation_test.cc @@ -61,7 +61,7 @@ class BufferDonationTest : public HloTestBase { absl::Span argument_literals, absl::Span donate_arguments, absl::Span expected_runtime_aliasing, - const Literal& expected) { + const Literal& expected, std::string expected_failure = "") { // Create a copy of the output shape because the HLO module is std::moved // into the compiler and may be deallocated. 
const Shape output_shape = hlo_module->result_shape(); @@ -119,13 +119,23 @@ class BufferDonationTest : public HloTestBase { } }); - args.emplace_back(ExecutionInput(std::move(owned_buffers))); + args.emplace_back( + ExecutionInput(std::move(owned_buffers), argument_literal.shape())); } - TF_ASSERT_OK_AND_ASSIGN( - ExecutionOutput output, + StatusOr output_status = executable->ExecuteAsyncOnStream(&service_run_options, std::move(args), - /*hlo_execution_profile=*/nullptr)); + /*hlo_execution_profile=*/nullptr); + if (!expected_failure.empty()) { + ASSERT_FALSE(output_status.ok()); + ASSERT_TRUE(absl::StrContains(output_status.status().error_message(), + expected_failure)) + << "got: \n" + << output_status.status().error_message() << " \nvs want\n" + << expected_failure; + return; + } + ExecutionOutput output = output_status.ConsumeValueOrDie(); se::DeviceMemoryBase result_root_buffer = output.Result().root_buffer(); LOG(INFO) << "result allocation = " << result_root_buffer.opaque() @@ -302,5 +312,37 @@ ENTRY entry { #endif } +TEST_F(BufferDonationTest, TestMustAliasNotDonated) { + HloModuleConfig config; + + StatusOr> module = + ParseAndReturnVerifiedModule(R"( +HloModule module + +ENTRY entry { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT out = (f32[], f32[]) tuple(a, b) +} + )", + config); + + TF_ASSERT_OK(module->get()->input_output_alias_config().SetUpAlias( + {0}, 0, {}, HloInputOutputAliasConfig::kMustAlias)); + + std::vector args; + args.push_back(LiteralUtil::CreateR0(0.1)); + args.push_back(LiteralUtil::CreateR0(0.2)); + Literal expected = LiteralUtil::MakeTupleFromSlices( + {LiteralUtil::CreateR0(0.1), LiteralUtil::CreateR0(0.2)}); + +#ifndef XLA_TEST_BACKEND_INTERPRETER + RunAndCheck(std::move(*module), args, + /*donate_arguments=*/{false, false}, {true, false}, expected, + "An input was configured to be must-alias at " + "compile time but not donated at runtime:"); +#endif +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/tests/collective_ops_test.cc b/tensorflow/compiler/xla/tests/collective_ops_test.cc index 7459b3d3f1f..ed5fabb663e 100644 --- a/tensorflow/compiler/xla/tests/collective_ops_test.cc +++ b/tensorflow/compiler/xla/tests/collective_ops_test.cc @@ -568,7 +568,8 @@ XLA_TEST_F(CollectiveOpsTest, CollectivePermute_Simple) { ten = u32[] constant(10) sum = u32[] add(replica, ten) p = u32[2] broadcast(sum), dimensions={} - ROOT permute = u32[2] collective-permute(p), source_target_pairs={{1,0}, {0,1}, {2,2}} + permute = u32[2] collective-permute(p), source_target_pairs={{1,0}, {0,1}, {2,2}} + ROOT copy = u32[2] copy(permute) } )"; const int64 kNumReplicas = 4; diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc index c63f1d0edf3..8021d6fe5db 100644 --- a/tensorflow/compiler/xla/tests/convolution_test.cc +++ b/tensorflow/compiler/xla/tests/convolution_test.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -// Tests of convolution with trivial kernels and no special variations (like +// Tests of 2+D convolution with trivial kernels and no special variations (like // strides and padding). 
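The TestMustAliasNotDonated case above establishes the parameter-to-output aliasing programmatically via SetUpAlias({0}, 0, {}, kMustAlias) and then expects ExecuteAsyncOnStream to fail because the caller never donates the buffer. The same constraint can also be expressed directly in HLO text; the attribute syntax below is recalled from the HLO parser and should be treated as an assumption, not as part of this patch:

// Hypothetical HLO-text form of the must-alias configuration used in the
// test above: output index {0} must alias parameter 0 at index {}.
constexpr const char* kMustAliasModule = R"(
HloModule module, input_output_alias={ {0}: (0, {}, must-alias) }

ENTRY entry {
  a = f32[] parameter(0)
  b = f32[] parameter(1)
  ROOT out = (f32[], f32[]) tuple(a, b)
}
)";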
#include @@ -240,174 +240,6 @@ class Convolve_1x1x4x4_1x1x3x3_Same : public ConvolutionTest { TYPED_TEST_CASE(Convolve_1x1x4x4_1x1x3x3_Same, TestTypes); TYPED_TEST(Convolve_1x1x4x4_1x1x3x3_Same, Types) { this->RunTest(); } -XLA_TEST_F(ConvolutionTest, Convolve1D_1x2x5_1x2x2_Valid) { - XlaBuilder builder(TestName()); - { - Shape input_shape = ShapeUtil::MakeShape(F32, {1, 2, 5}); - Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 2, 2}); - auto input = Parameter(&builder, 0, input_shape, "input"); - auto filter = Parameter(&builder, 1, filter_shape, "filter"); - Conv(input, filter, {1}, Padding::kValid); - } - - Array3D input({{{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}}}); - Array3D filter({{{10, 20}, {30, 40}}}); - - Array3D expected({{{510, 610, 710, 810}}}); - - auto input_literal = - client_->TransferToServer(LiteralUtil::CreateR3FromArray3D(input)) - .ConsumeValueOrDie(); - auto filter_literal = - client_->TransferToServer(LiteralUtil::CreateR3FromArray3D(filter)) - .ConsumeValueOrDie(); - - ComputeAndCompareR3(&builder, expected, - {input_literal.get(), filter_literal.get()}, - error_spec_); -} - -template -class Convolve1D_1x2x5_1x2x2_WithRHSDilation : public ConvolutionTest { - public: - void RunTest() { - XlaBuilder builder(TestName()); - { - Shape input_shape = ShapeUtil::MakeShapeWithType({1, 2, 5}); - Shape filter_shape = ShapeUtil::MakeShapeWithType({1, 2, 2}); - auto input = Parameter(&builder, 0, input_shape, "input"); - auto filter = Parameter(&builder, 1, filter_shape, "filter"); - // Convolution dimensions are bf0_oi0->bo0. - ConvGeneralDilated( - input, filter, /*window_strides=*/{1}, /*padding=*/{{0, 0}}, - /*lhs_dilation=*/{1}, /*rhs_dilation=*/{2}, - /*dimension_numbers=*/builder.CreateDefaultConvDimensionNumbers(1)); - } - - Array3D input( - {{{1.0f, 2.0f, 3.0f, 4.0f, 5.0f}, {6.0f, 7.0f, 8.0f, 9.0f, 10.0f}}}); - Array3D filter({{{10.0f, 20.0f}, {30.0f, 40.0f}}}); - - Array3D expected({{{570.0f, 670.0f, 770.0f}}}); - - auto input_literal = - client_->TransferToServer(LiteralUtil::CreateR3FromArray3D(input)) - .ConsumeValueOrDie(); - auto filter_literal = - client_->TransferToServer(LiteralUtil::CreateR3FromArray3D(filter)) - .ConsumeValueOrDie(); - - ComputeAndCompareR3(&builder, expected, - {input_literal.get(), filter_literal.get()}, - error_spec_); - } -}; // namespace - -TYPED_TEST_CASE(Convolve1D_1x2x5_1x2x2_WithRHSDilation, TestTypes); -TYPED_TEST(Convolve1D_1x2x5_1x2x2_WithRHSDilation, Types) { this->RunTest(); } - -XLA_TEST_F(ConvolutionTest, Convolve1D_1x2x5_1x2x2_WithLHSDilation) { - XlaBuilder builder(TestName()); - { - Shape input_shape = ShapeUtil::MakeShape(F32, {1, 2, 5}); - Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 2, 2}); - auto input = Parameter(&builder, 0, input_shape, "input"); - auto filter = Parameter(&builder, 1, filter_shape, "filter"); - // Convolution dimensions are bf0_oi0->bo0. 
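As a sanity check on the 1D cases being moved out of this file (they reappear verbatim in convolution_test_1d.cc below), the RHS-dilated expectation {570, 670, 770} can be reproduced with a few lines of plain C++, independent of XLA: dilation 2 spreads the two filter taps over input offsets 0 and 2, so the first output is 1*10 + 3*20 + 6*30 + 8*40 = 570.

#include <cstdio>

// Reference computation for Convolve1D_1x2x5_1x2x2_WithRHSDilation (sketch).
int main() {
  const float input[2][5] = {{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}};
  const float filter[2][2] = {{10, 20}, {30, 40}};  // [input_feature][tap]
  const int rhs_dilation = 2;
  // Effective filter width = (2 - 1) * rhs_dilation + 1 = 3, so a width-5
  // input yields 3 valid output positions.
  for (int o = 0; o < 3; ++o) {
    float acc = 0;
    for (int f = 0; f < 2; ++f) {      // input feature
      for (int k = 0; k < 2; ++k) {    // filter tap
        acc += input[f][o + k * rhs_dilation] * filter[f][k];
      }
    }
    std::printf("%g ", acc);           // prints: 570 670 770
  }
  return 0;
}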
- ConvGeneralDilated( - input, filter, /*window_strides=*/{1}, /*padding=*/{{0, 0}}, - /*lhs_dilation=*/{2}, /*rhs_dilation=*/{1}, - /*dimension_numbers=*/builder.CreateDefaultConvDimensionNumbers(1)); - } - - Array3D input({{{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}}}); - Array3D filter({{{10, 20}, {30, 40}}}); - - Array3D expected({{{190, 320, 230, 380, 270, 440, 310, 500}}}); - - auto input_literal = - client_->TransferToServer(LiteralUtil::CreateR3FromArray3D(input)) - .ConsumeValueOrDie(); - auto filter_literal = - client_->TransferToServer(LiteralUtil::CreateR3FromArray3D(filter)) - .ConsumeValueOrDie(); - - ComputeAndCompareR3(&builder, expected, - {input_literal.get(), filter_literal.get()}, - error_spec_); -} - -XLA_TEST_F(ConvolutionTest, Convolve1D_1x2x5_1x2x2_WithLHSAndRHSDilation) { - XlaBuilder builder(TestName()); - { - Shape input_shape = ShapeUtil::MakeShape(F32, {1, 2, 5}); - Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 2, 2}); - auto input = Parameter(&builder, 0, input_shape, "input"); - auto filter = Parameter(&builder, 1, filter_shape, "filter"); - // Convolution dimensions are bf0_oi0->bo0. - ConvGeneralDilated( - input, filter, /*window_strides=*/{1}, /*padding=*/{{0, 0}}, - /*lhs_dilation=*/{2}, /*rhs_dilation=*/{2}, - /*dimension_numbers=*/builder.CreateDefaultConvDimensionNumbers(1)); - } - - Array3D input({{{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}}}); - Array3D filter({{{10, 20}, {30, 40}}}); - - Array3D expected({{{510, 0, 610, 0, 710, 0, 810}}}); - - auto input_literal = - client_->TransferToServer(LiteralUtil::CreateR3FromArray3D(input)) - .ConsumeValueOrDie(); - auto filter_literal = - client_->TransferToServer(LiteralUtil::CreateR3FromArray3D(filter)) - .ConsumeValueOrDie(); - - ComputeAndCompareR3(&builder, expected, - {input_literal.get(), filter_literal.get()}, - error_spec_); -} - -template -class Convolve1D_1x2x5_1x2x2_WithPadding : public ConvolutionTest { - public: - void RunTest() { - XlaBuilder builder(TestName()); - { - Shape input_shape = ShapeUtil::MakeShapeWithType({1, 2, 5}); - Shape filter_shape = ShapeUtil::MakeShapeWithType({1, 2, 2}); - auto input = Parameter(&builder, 0, input_shape, "input"); - auto filter = Parameter(&builder, 1, filter_shape, "filter"); - // Convolution dimensions are bf0_oi0->bo0. 
- ConvGeneralDilated( - input, filter, /*window_strides=*/{1}, /*padding=*/{{2, 2}}, - /*lhs_dilation=*/{1}, /*rhs_dilation=*/{1}, - /*dimension_numbers=*/builder.CreateDefaultConvDimensionNumbers(1)); - } - - Array3D input( - {{{1.0f, 2.0f, 3.0f, 4.0f, 5.0f}, {6.0f, 7.0f, 8.0f, 9.0f, 10.0f}}}); - Array3D filter({{{10.0f, 20.0f}, {30.0f, 40.0f}}}); - - Array3D expected( - {{{0.0f, 260.0f, 510.0f, 610.0f, 710.0f, 810.0f, 350.0f, 0.0f}}}); - - auto input_literal = - client_->TransferToServer(LiteralUtil::CreateR3FromArray3D(input)) - .ConsumeValueOrDie(); - auto filter_literal = - client_->TransferToServer(LiteralUtil::CreateR3FromArray3D(filter)) - .ConsumeValueOrDie(); - - ComputeAndCompareR3(&builder, expected, - {input_literal.get(), filter_literal.get()}, - error_spec_); - } -}; - -TYPED_TEST_CASE(Convolve1D_1x2x5_1x2x2_WithPadding, TestTypes); -TYPED_TEST(Convolve1D_1x2x5_1x2x2_WithPadding, Types) { this->RunTest(); } - XLA_TEST_F(ConvolutionTest, Convolve3D_1x4x2x3x3_2x2x2x3x3_Valid) { XlaBuilder builder(TestName()); std::vector input_dims = {1, 4, 2, 3, 3}; @@ -1714,150 +1546,7 @@ INSTANTIATE_TEST_CASE_P(ConvolveWithAndWithoutCanonicalization_Instantiation, ConvolveWithAndWithoutCanonicalization, ::testing::Values(true, false)); -struct Convolve1DTestParam { - int64 input_feature; - int64 output_feature; - int64 batch; - int64 window_size; - int64 num_windows; -}; -class Convolve1D1WindowTestBase - : public ConvolutionTest, - public ::testing::WithParamInterface { - protected: - template - void TestImpl() { - XlaBuilder builder(TestName()); - int64 input_feature = GetParam().input_feature; - int64 output_feature = GetParam().output_feature; - int64 batch = GetParam().batch; - int64 num_windows = GetParam().num_windows; - int64 window_size = GetParam().window_size; - std::vector input_dims = {batch, window_size + num_windows - 1, - input_feature}; - std::vector filter_dims = {window_size, input_feature, - output_feature}; - Shape input_shape = ShapeUtil::MakeShapeWithType(input_dims); - Shape filter_shape = ShapeUtil::MakeShapeWithType(filter_dims); - { - auto input = Parameter(&builder, 0, input_shape, "input"); - auto filter = Parameter(&builder, 1, filter_shape, "filter"); - - // Tensorflow dimension numbers for 1D convolution. 
- ConvolutionDimensionNumbers dnums; - dnums.set_input_batch_dimension(0); - dnums.set_output_batch_dimension(0); - dnums.add_input_spatial_dimensions(1); - dnums.add_output_spatial_dimensions(1); - dnums.set_input_feature_dimension(2); - dnums.set_output_feature_dimension(2); - dnums.add_kernel_spatial_dimensions(0); - dnums.set_kernel_input_feature_dimension(1); - dnums.set_kernel_output_feature_dimension(2); - - ConvWithGeneralDimensions(input, filter, {1}, Padding::kValid, dnums); - } - - std::vector input_elems(ShapeUtil::ElementsIn(input_shape), - static_cast(1.0f)); - auto input_r1 = LiteralUtil::CreateR1(input_elems); - auto input_r3 = input_r1.Reshape(input_dims).ConsumeValueOrDie(); - - std::vector filter_elems(ShapeUtil::ElementsIn(filter_shape), - static_cast(1.0f)); - - auto filter_r1 = LiteralUtil::CreateR1(filter_elems); - auto filter_r3 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie(); - - std::vector expect_elems(batch * output_feature * num_windows, - static_cast(window_size * input_feature)); - auto expected_r1 = LiteralUtil::CreateR1(expect_elems); - auto expected_r3 = expected_r1.Reshape({batch, num_windows, output_feature}) - .ConsumeValueOrDie(); - - auto input_literal = - client_->TransferToServer(input_r3).ConsumeValueOrDie(); - auto filter_literal = - client_->TransferToServer(filter_r3).ConsumeValueOrDie(); - ComputeAndCompareLiteral(&builder, expected_r3, - {input_literal.get(), filter_literal.get()}, - error_spec_); - } -}; - -class Convolve1D1WindowTestFloat : public Convolve1D1WindowTestBase {}; - -XLA_TEST_P(Convolve1D1WindowTestFloat, Convolve1D1Window) { TestImpl(); } - -INSTANTIATE_TEST_CASE_P( - Convolve1D1WindowTest_Instantiation, Convolve1D1WindowTestFloat, - ::testing::Values(Convolve1DTestParam{1, 1, 1, 1, 2}, - Convolve1DTestParam{160, 1, 1, 5, 1}, - Convolve1DTestParam{24, 1, 1, 20, 1}, - Convolve1DTestParam{30, 1, 1, 20, 1}, - Convolve1DTestParam{23, 1, 1, 20, 20}, - Convolve1DTestParam{25, 1, 1, 20, 1}, - Convolve1DTestParam{24, 1, 1, 10, 5}, - Convolve1DTestParam{160, 1, 1, 10, 1}, - Convolve1DTestParam{255, 1, 1, 3, 1}, - Convolve1DTestParam{130, 1, 1, 1, 2}, - Convolve1DTestParam{136, 1, 1, 1, 2}, - Convolve1DTestParam{64, 1, 1, 1, 1}, - Convolve1DTestParam{128, 1, 1, 1, 1}, - Convolve1DTestParam{139, 1, 1, 128, 1}, - Convolve1DTestParam{1, 10, 10, 1, 10}, - Convolve1DTestParam{1, 10, 130, 1, 2}, - Convolve1DTestParam{1, 10, 130, 1, 1}, - Convolve1DTestParam{1, 64, 64, 1, 10}, - Convolve1DTestParam{1, 65, 65, 1, 1}, - Convolve1DTestParam{1, 128, 128, 1, 1}, - Convolve1DTestParam{128, 128, 128, 128, 1}, - Convolve1DTestParam{1, 128, 128, 1, 1}, - Convolve1DTestParam{2, 2, 2, 2, 1}, - Convolve1DTestParam{161, 1, 1, 10, 1}, - Convolve1DTestParam{900, 1, 1, 10, 1}, - Convolve1DTestParam{640, 3, 3, 128, 1}) - -); - -#if (XLA_TEST_BACKEND_GPU || XLA_TEST_BACKEND_CPU) -class Convolve1D1WindowTestHalf : public Convolve1D1WindowTestBase {}; - -XLA_TEST_P(Convolve1D1WindowTestHalf, Convolve1D1Window) { - TestImpl(); -} - -INSTANTIATE_TEST_CASE_P( - Convolve1D1WindowTest_Instantiation, Convolve1D1WindowTestHalf, - ::testing::Values(Convolve1DTestParam{1, 1, 1, 1, 2}, - Convolve1DTestParam{160, 1, 1, 5, 1}, - Convolve1DTestParam{24, 1, 1, 20, 1}, - Convolve1DTestParam{30, 1, 1, 20, 1}, - Convolve1DTestParam{23, 1, 1, 20, 20}, - Convolve1DTestParam{25, 1, 1, 20, 1}, - Convolve1DTestParam{24, 1, 1, 10, 5}, - Convolve1DTestParam{160, 1, 1, 10, 1}, - Convolve1DTestParam{255, 1, 1, 3, 1}, - Convolve1DTestParam{130, 1, 1, 1, 3}, - Convolve1DTestParam{64, 1, 
1, 1, 1}, - Convolve1DTestParam{128, 1, 1, 1, 1}, - Convolve1DTestParam{139, 1, 1, 128, 1}, - Convolve1DTestParam{640, 3, 3, 128, 1}, - Convolve1DTestParam{900, 1, 1, 10, 1}, - Convolve1DTestParam{1, 10, 10, 1, 10}, - Convolve1DTestParam{1, 10, 130, 1, 1}, - Convolve1DTestParam{1, 10, 130, 1, 2}, - Convolve1DTestParam{1, 64, 64, 1, 10}, - Convolve1DTestParam{1, 65, 65, 1, 1}, - Convolve1DTestParam{1, 128, 128, 1, 1}, - Convolve1DTestParam{128, 128, 128, 128, 1}, - Convolve1DTestParam{1, 128, 128, 1, 1}, - Convolve1DTestParam{2, 2, 2, 2, 1}, - Convolve1DTestParam{161, 1, 1, 10, 1}) - -); -#endif XLA_TEST_F(ConvolutionTest, Convolve_bf16_1x1x1x2_1x1x1x2_Valid) { XlaBuilder builder(TestName()); diff --git a/tensorflow/compiler/xla/tests/convolution_test_1d.cc b/tensorflow/compiler/xla/tests/convolution_test_1d.cc new file mode 100644 index 00000000000..2b2bf098145 --- /dev/null +++ b/tensorflow/compiler/xla/tests/convolution_test_1d.cc @@ -0,0 +1,376 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Tests of 1D convolution with trivial kernels and no special variations (like +// strides and padding). + +#include + +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "tensorflow/compiler/xla/array2d.h" +#include "tensorflow/compiler/xla/array4d.h" +#include "tensorflow/compiler/xla/client/global_data.h" +#include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/padding.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/layout_util.h" +#include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/reference_util.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/tests/client_library_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/tests/literal_test_util.h" +#include "tensorflow/compiler/xla/tests/test_macros.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { +namespace { + +class ConvolutionTest : public ClientLibraryTestBase { + protected: +#if XLA_TEST_BACKEND_GPU + // XLA:GPU sometimes uses FFT convolution which isn't as precise as spatial + // convolution. So relax the absolute error threshold. 
+ ErrorSpec error_spec_ = ErrorSpec(1e-2, 1e-3); +#else + ErrorSpec error_spec_ = ErrorSpec(1e-4, 1e-3); +#endif +}; + +#ifdef XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT16 +using TestTypes = ::testing::Types; +#else +using TestTypes = ::testing::Types; +#endif + +struct Convolve1DTestParam { + int64 input_feature; + int64 output_feature; + int64 batch; + int64 window_size; + int64 num_windows; +}; + +class Convolve1D1WindowTestBase + : public ConvolutionTest, + public ::testing::WithParamInterface { + protected: + template + void TestImpl() { + XlaBuilder builder(TestName()); + int64 input_feature = GetParam().input_feature; + int64 output_feature = GetParam().output_feature; + int64 batch = GetParam().batch; + int64 num_windows = GetParam().num_windows; + int64 window_size = GetParam().window_size; + std::vector input_dims = {batch, window_size + num_windows - 1, + input_feature}; + std::vector filter_dims = {window_size, input_feature, + output_feature}; + Shape input_shape = ShapeUtil::MakeShapeWithType(input_dims); + Shape filter_shape = ShapeUtil::MakeShapeWithType(filter_dims); + { + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); + + // Tensorflow dimension numbers for 1D convolution. + ConvolutionDimensionNumbers dnums; + dnums.set_input_batch_dimension(0); + dnums.set_output_batch_dimension(0); + dnums.add_input_spatial_dimensions(1); + dnums.add_output_spatial_dimensions(1); + dnums.set_input_feature_dimension(2); + dnums.set_output_feature_dimension(2); + dnums.add_kernel_spatial_dimensions(0); + dnums.set_kernel_input_feature_dimension(1); + dnums.set_kernel_output_feature_dimension(2); + + ConvWithGeneralDimensions(input, filter, {1}, Padding::kValid, dnums); + } + + std::vector input_elems(ShapeUtil::ElementsIn(input_shape), + static_cast(1.0f)); + auto input_r1 = LiteralUtil::CreateR1(input_elems); + auto input_r3 = input_r1.Reshape(input_dims).ConsumeValueOrDie(); + + std::vector filter_elems(ShapeUtil::ElementsIn(filter_shape), + static_cast(1.0f)); + + auto filter_r1 = LiteralUtil::CreateR1(filter_elems); + auto filter_r3 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie(); + + std::vector expect_elems(batch * output_feature * num_windows, + static_cast(window_size * input_feature)); + auto expected_r1 = LiteralUtil::CreateR1(expect_elems); + auto expected_r3 = expected_r1.Reshape({batch, num_windows, output_feature}) + .ConsumeValueOrDie(); + + auto input_literal = + client_->TransferToServer(input_r3).ConsumeValueOrDie(); + auto filter_literal = + client_->TransferToServer(filter_r3).ConsumeValueOrDie(); + ComputeAndCompareLiteral(&builder, expected_r3, + {input_literal.get(), filter_literal.get()}, + error_spec_); + } +}; + +class Convolve1D1WindowTestFloat : public Convolve1D1WindowTestBase {}; + +XLA_TEST_P(Convolve1D1WindowTestFloat, Convolve1D1Window) { TestImpl(); } + +INSTANTIATE_TEST_CASE_P( + Convolve1D1WindowTest_Instantiation, Convolve1D1WindowTestFloat, + ::testing::Values(Convolve1DTestParam{1, 1, 1, 1, 2}, + Convolve1DTestParam{160, 1, 1, 5, 1}, + Convolve1DTestParam{24, 1, 1, 20, 1}, + Convolve1DTestParam{30, 1, 1, 20, 1}, + Convolve1DTestParam{23, 1, 1, 20, 20}, + Convolve1DTestParam{25, 1, 1, 20, 1}, + Convolve1DTestParam{24, 1, 1, 10, 5}, + Convolve1DTestParam{160, 1, 1, 10, 1}, + Convolve1DTestParam{255, 1, 1, 3, 1}, + Convolve1DTestParam{130, 1, 1, 1, 2}, + Convolve1DTestParam{136, 1, 1, 1, 2}, + Convolve1DTestParam{64, 1, 1, 1, 1}, + Convolve1DTestParam{128, 1, 1, 1, 1}, + 
Convolve1DTestParam{139, 1, 1, 128, 1}, + Convolve1DTestParam{1, 10, 10, 1, 10}, + Convolve1DTestParam{1, 10, 130, 1, 2}, + Convolve1DTestParam{1, 10, 130, 1, 1}, + Convolve1DTestParam{1, 64, 64, 1, 10}, + Convolve1DTestParam{1, 65, 65, 1, 1}, + Convolve1DTestParam{1, 128, 128, 1, 1}, + Convolve1DTestParam{128, 128, 128, 128, 1}, + Convolve1DTestParam{1, 128, 128, 1, 1}, + Convolve1DTestParam{2, 2, 2, 2, 1}, + Convolve1DTestParam{161, 1, 1, 10, 1}, + Convolve1DTestParam{900, 1, 1, 10, 1}, + Convolve1DTestParam{640, 3, 3, 128, 1}) + +); + +#if (XLA_TEST_BACKEND_GPU || XLA_TEST_BACKEND_CPU) +class Convolve1D1WindowTestHalf : public Convolve1D1WindowTestBase {}; + +XLA_TEST_P(Convolve1D1WindowTestHalf, Convolve1D1Window) { + TestImpl(); +} + +INSTANTIATE_TEST_CASE_P( + Convolve1D1WindowTest_Instantiation, Convolve1D1WindowTestHalf, + ::testing::Values(Convolve1DTestParam{1, 1, 1, 1, 2}, + Convolve1DTestParam{160, 1, 1, 5, 1}, + Convolve1DTestParam{24, 1, 1, 20, 1}, + Convolve1DTestParam{30, 1, 1, 20, 1}, + Convolve1DTestParam{23, 1, 1, 20, 20}, + Convolve1DTestParam{25, 1, 1, 20, 1}, + Convolve1DTestParam{24, 1, 1, 10, 5}, + Convolve1DTestParam{160, 1, 1, 10, 1}, + Convolve1DTestParam{255, 1, 1, 3, 1}, + Convolve1DTestParam{130, 1, 1, 1, 3}, + Convolve1DTestParam{64, 1, 1, 1, 1}, + Convolve1DTestParam{128, 1, 1, 1, 1}, + Convolve1DTestParam{139, 1, 1, 128, 1}, + Convolve1DTestParam{640, 3, 3, 128, 1}, + Convolve1DTestParam{900, 1, 1, 10, 1}, + Convolve1DTestParam{1, 10, 10, 1, 10}, + Convolve1DTestParam{1, 10, 130, 1, 1}, + Convolve1DTestParam{1, 10, 130, 1, 2}, + Convolve1DTestParam{1, 64, 64, 1, 10}, + Convolve1DTestParam{1, 65, 65, 1, 1}, + Convolve1DTestParam{1, 128, 128, 1, 1}, + Convolve1DTestParam{128, 128, 128, 128, 1}, + Convolve1DTestParam{1, 128, 128, 1, 1}, + Convolve1DTestParam{2, 2, 2, 2, 1}, + Convolve1DTestParam{161, 1, 1, 10, 1}) + +); +#endif + +XLA_TEST_F(ConvolutionTest, Convolve1D_1x2x5_1x2x2_Valid) { + XlaBuilder builder(TestName()); + { + Shape input_shape = ShapeUtil::MakeShape(F32, {1, 2, 5}); + Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 2, 2}); + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); + Conv(input, filter, {1}, Padding::kValid); + } + + Array3D input({{{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}}}); + Array3D filter({{{10, 20}, {30, 40}}}); + + Array3D expected({{{510, 610, 710, 810}}}); + + auto input_literal = + client_->TransferToServer(LiteralUtil::CreateR3FromArray3D(input)) + .ConsumeValueOrDie(); + auto filter_literal = + client_->TransferToServer(LiteralUtil::CreateR3FromArray3D(filter)) + .ConsumeValueOrDie(); + + ComputeAndCompareR3(&builder, expected, + {input_literal.get(), filter_literal.get()}, + error_spec_); +} + +template +class Convolve1D_1x2x5_1x2x2_WithRHSDilation : public ConvolutionTest { + public: + void RunTest() { + XlaBuilder builder(TestName()); + { + Shape input_shape = ShapeUtil::MakeShapeWithType({1, 2, 5}); + Shape filter_shape = ShapeUtil::MakeShapeWithType({1, 2, 2}); + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); + // Convolution dimensions are bf0_oi0->bo0. 
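Because the parameterized Convolve1D1Window cases feed all-ones inputs and filters, every output element is simply the number of multiply-accumulates, which keeps the expected literal cheap to build for any of the parameter tuples above. A sketch of that bookkeeping (not part of the test itself):

// For all-ones data, each output element of the valid 1D convolution equals
// window_size * input_feature, and there are batch * num_windows *
// output_feature such elements.
struct Convolve1DTestParamSketch {
  long long input_feature, output_feature, batch, window_size, num_windows;
};
float ExpectedElement(const Convolve1DTestParamSketch& p) {
  return static_cast<float>(p.window_size * p.input_feature);
}
long long NumOutputElements(const Convolve1DTestParamSketch& p) {
  return p.batch * p.num_windows * p.output_feature;
}
// E.g. for {640, 3, 3, 128, 1}: each element is 128 * 640 = 81920 and the
// output holds 3 * 1 * 3 = 9 elements.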
+ ConvGeneralDilated( + input, filter, /*window_strides=*/{1}, /*padding=*/{{0, 0}}, + /*lhs_dilation=*/{1}, /*rhs_dilation=*/{2}, + /*dimension_numbers=*/builder.CreateDefaultConvDimensionNumbers(1)); + } + + Array3D input( + {{{1.0f, 2.0f, 3.0f, 4.0f, 5.0f}, {6.0f, 7.0f, 8.0f, 9.0f, 10.0f}}}); + Array3D filter({{{10.0f, 20.0f}, {30.0f, 40.0f}}}); + + Array3D expected({{{570.0f, 670.0f, 770.0f}}}); + + auto input_literal = + client_->TransferToServer(LiteralUtil::CreateR3FromArray3D(input)) + .ConsumeValueOrDie(); + auto filter_literal = + client_->TransferToServer(LiteralUtil::CreateR3FromArray3D(filter)) + .ConsumeValueOrDie(); + + ComputeAndCompareR3(&builder, expected, + {input_literal.get(), filter_literal.get()}, + error_spec_); + } +}; // namespace + +TYPED_TEST_CASE(Convolve1D_1x2x5_1x2x2_WithRHSDilation, TestTypes); +TYPED_TEST(Convolve1D_1x2x5_1x2x2_WithRHSDilation, Types) { this->RunTest(); } + +XLA_TEST_F(ConvolutionTest, Convolve1D_1x2x5_1x2x2_WithLHSDilation) { + XlaBuilder builder(TestName()); + { + Shape input_shape = ShapeUtil::MakeShape(F32, {1, 2, 5}); + Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 2, 2}); + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); + // Convolution dimensions are bf0_oi0->bo0. + ConvGeneralDilated( + input, filter, /*window_strides=*/{1}, /*padding=*/{{0, 0}}, + /*lhs_dilation=*/{2}, /*rhs_dilation=*/{1}, + /*dimension_numbers=*/builder.CreateDefaultConvDimensionNumbers(1)); + } + + Array3D input({{{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}}}); + Array3D filter({{{10, 20}, {30, 40}}}); + + Array3D expected({{{190, 320, 230, 380, 270, 440, 310, 500}}}); + + auto input_literal = + client_->TransferToServer(LiteralUtil::CreateR3FromArray3D(input)) + .ConsumeValueOrDie(); + auto filter_literal = + client_->TransferToServer(LiteralUtil::CreateR3FromArray3D(filter)) + .ConsumeValueOrDie(); + + ComputeAndCompareR3(&builder, expected, + {input_literal.get(), filter_literal.get()}, + error_spec_); +} + +XLA_TEST_F(ConvolutionTest, Convolve1D_1x2x5_1x2x2_WithLHSAndRHSDilation) { + XlaBuilder builder(TestName()); + { + Shape input_shape = ShapeUtil::MakeShape(F32, {1, 2, 5}); + Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 2, 2}); + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); + // Convolution dimensions are bf0_oi0->bo0. 
+ ConvGeneralDilated( + input, filter, /*window_strides=*/{1}, /*padding=*/{{0, 0}}, + /*lhs_dilation=*/{2}, /*rhs_dilation=*/{2}, + /*dimension_numbers=*/builder.CreateDefaultConvDimensionNumbers(1)); + } + + Array3D input({{{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}}}); + Array3D filter({{{10, 20}, {30, 40}}}); + + Array3D expected({{{510, 0, 610, 0, 710, 0, 810}}}); + + auto input_literal = + client_->TransferToServer(LiteralUtil::CreateR3FromArray3D(input)) + .ConsumeValueOrDie(); + auto filter_literal = + client_->TransferToServer(LiteralUtil::CreateR3FromArray3D(filter)) + .ConsumeValueOrDie(); + + ComputeAndCompareR3(&builder, expected, + {input_literal.get(), filter_literal.get()}, + error_spec_); +} + +template +class Convolve1D_1x2x5_1x2x2_WithPadding : public ConvolutionTest { + public: + void RunTest() { + XlaBuilder builder(TestName()); + { + Shape input_shape = ShapeUtil::MakeShapeWithType({1, 2, 5}); + Shape filter_shape = ShapeUtil::MakeShapeWithType({1, 2, 2}); + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); + // Convolution dimensions are bf0_oi0->bo0. + ConvGeneralDilated( + input, filter, /*window_strides=*/{1}, /*padding=*/{{2, 2}}, + /*lhs_dilation=*/{1}, /*rhs_dilation=*/{1}, + /*dimension_numbers=*/builder.CreateDefaultConvDimensionNumbers(1)); + } + + Array3D input( + {{{1.0f, 2.0f, 3.0f, 4.0f, 5.0f}, {6.0f, 7.0f, 8.0f, 9.0f, 10.0f}}}); + Array3D filter({{{10.0f, 20.0f}, {30.0f, 40.0f}}}); + + Array3D expected( + {{{0.0f, 260.0f, 510.0f, 610.0f, 710.0f, 810.0f, 350.0f, 0.0f}}}); + + auto input_literal = + client_->TransferToServer(LiteralUtil::CreateR3FromArray3D(input)) + .ConsumeValueOrDie(); + auto filter_literal = + client_->TransferToServer(LiteralUtil::CreateR3FromArray3D(filter)) + .ConsumeValueOrDie(); + + ComputeAndCompareR3(&builder, expected, + {input_literal.get(), filter_literal.get()}, + error_spec_); + } +}; + +TYPED_TEST_CASE(Convolve1D_1x2x5_1x2x2_WithPadding, TestTypes); +TYPED_TEST(Convolve1D_1x2x5_1x2x2_WithPadding, Types) { this->RunTest(); } + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc index 26cb25acbfe..60ba27b2050 100644 --- a/tensorflow/compiler/xla/tests/dot_operation_test.cc +++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc @@ -1429,19 +1429,137 @@ ENTRY main { EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{4e-3, 4e-3})); } -XLA_TEST_F(DotOperationTextTest, IntegerDotCodegen) { +XLA_TEST_F(DotOperationTextTest, S32IotaDot) { absl::string_view hlo_string = R"( HloModule SmallIntegerDot ENTRY SmallIntegerDot { - arg0 = s32[1,2,2] parameter(0) - arg1 = s32[1,2,1] parameter(1) - ROOT dot = s32[1,2,1] dot(arg0, arg1), lhs_batch_dims={0}, lhs_contracting_dims={2}, rhs_batch_dims={0}, rhs_contracting_dims={1} + arg0 = s32[5,55,8] iota(), iota_dimension=1 + arg1 = s32[5,8,200] iota(), iota_dimension=2 + ROOT dot = s32[5,55,200] dot(arg0, arg1), lhs_batch_dims={0}, lhs_contracting_dims={2}, rhs_batch_dims={0}, rhs_contracting_dims={1} } )"; - EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{4e-3, 4e-3})); + EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{0, 0})); +} + +XLA_TEST_F(DotOperationTextTest, S32IotaSquaredDot) { + absl::string_view hlo_string = + R"( +HloModule SmallIntegerDot + +ENTRY SmallIntegerDot { + arg0 = s32[16,2] iota(), iota_dimension=0 + a = s32[16,2] multiply(arg0, arg0) + r = s32[16,2] multiply(a, a) + arg1 = s32[2,98] iota(), 
iota_dimension=1 + b = s32[2,98] multiply(arg1, arg1) + s = s32[2,98] multiply(b, b) + ROOT dot = s32[16,98] dot(r, s), lhs_contracting_dims={1}, rhs_contracting_dims={0} +} +)"; + + EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{0, 0})); +} + +XLA_TEST_F(DotOperationTextTest, DISABLED_ON_CPU(U16IotaDot)) { + absl::string_view hlo_string = + R"( +HloModule SmallIntegerDot + +ENTRY SmallIntegerDot { + arg0 = u16[5,55,8] parameter(0) + arg1 = u16[5,8,200] parameter(1) + dot = u16[5,55,200] dot(arg0, arg1), lhs_batch_dims={0}, lhs_contracting_dims={2}, rhs_batch_dims={0}, rhs_contracting_dims={1} + ROOT c = s32[5,55,200] convert(dot) +} +)"; + + EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{0, 0})); +} + +XLA_TEST_F(DotOperationTextTest, DISABLED_ON_CPU(U16IotaSquaredDot)) { + absl::string_view hlo_string = + R"( +HloModule SmallIntegerDot + +ENTRY SmallIntegerDot { + arg0 = u16[16,2] iota(), iota_dimension=0 + a = u16[16,2] multiply(arg0, arg0) + r = u16[16,2] multiply(a, a) + arg1 = u16[2,98] iota(), iota_dimension=1 + b = u16[2,98] multiply(arg1, arg1) + s = u16[2,98] multiply(b, b) + ROOT dot = u16[16,98] dot(r, s), lhs_contracting_dims={1}, rhs_contracting_dims={0} +} +)"; + + EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{0, 0})); +} + +XLA_TEST_F(DotOperationTextTest, DISABLED_ON_CPU(S16IotaDot)) { + absl::string_view hlo_string = + R"( +HloModule SmallIntegerDot + +ENTRY SmallIntegerDot { + arg0 = s16[5,55,8] iota(), iota_dimension=1 + arg1 = s16[5,8,200] iota(), iota_dimension=2 + ROOT dot = s16[5,55,200] dot(arg0, arg1), lhs_batch_dims={0}, lhs_contracting_dims={2}, rhs_batch_dims={0}, rhs_contracting_dims={1} +} +)"; + + EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{0, 0})); +} + +XLA_TEST_F(DotOperationTextTest, DISABLED_ON_CPU(S16IotaSquaredDot)) { + absl::string_view hlo_string = + R"( +HloModule SmallIntegerDot + +ENTRY SmallIntegerDot { + arg0 = s16[16,2] iota(), iota_dimension=0 + a = s16[16,2] multiply(arg0, arg0) + r = s16[16,2] multiply(a, a) + arg1 = s16[2,98] iota(), iota_dimension=1 + b = s16[2,98] multiply(arg1, arg1) + s = s16[2,98] multiply(b, b) + ROOT dot = s16[16,98] dot(r, s), lhs_contracting_dims={1}, rhs_contracting_dims={0} +} +)"; + + EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{0, 0})); +} + +XLA_TEST_F(DotOperationTextTest, DISABLED_ON_CPU(S8Dot)) { + absl::string_view hlo_string = + R"( +HloModule SmallIntegerDot + +ENTRY SmallIntegerDot { + arg0 = s8[20,2] parameter(0) + arg1 = s8[2,20] parameter(1) + ROOT dot = s8[20,20] dot(arg0, arg1), lhs_contracting_dims={1}, rhs_contracting_dims={0} +} +)"; + + EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{0, 0})); +} + +XLA_TEST_F(DotOperationTextTest, S32Dot) { + absl::string_view hlo_string = + R"( +HloModule SmallIntegerDot + +ENTRY SmallIntegerDot { + arg0 = s32[20,55] parameter(0) + arg1 = s32[55,20] parameter(1) + ROOT dot = s32[20,20] dot(arg0, arg1), lhs_contracting_dims={1}, rhs_contracting_dims={0} +} +)"; + + EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{0, 0})); } XLA_TEST_F(DotOperationTextTest, GpuTransposeOutput) { diff --git a/tensorflow/compiler/xla/tests/dynamism_inference_test.cc b/tensorflow/compiler/xla/tests/dynamism_inference_test.cc new file mode 100644 index 00000000000..ba4092def16 --- /dev/null +++ b/tensorflow/compiler/xla/tests/dynamism_inference_test.cc @@ -0,0 +1,215 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include +#include + +#include "absl/strings/match.h" +#include "tensorflow/compiler/xla/client/client_library.h" +#include "tensorflow/compiler/xla/client/global_data.h" +#include "tensorflow/compiler/xla/client/lib/prng.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/layout_util.h" +#include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/tests/literal_test_util.h" +#include "tensorflow/compiler/xla/tests/test_macros.h" +#include "tensorflow/compiler/xla/tests/test_utils.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { +namespace { + +// An enumerator for the client types that we want to iterate over in +// the various tests. +enum class ClientType { kLocal, kCompileOnly }; +ClientType client_types[] = {ClientType::kLocal, ClientType::kCompileOnly}; + +class DynamismInferenceTest : public ::testing::Test { + public: + explicit DynamismInferenceTest(se::Platform* platform = nullptr) + : platform_(platform) {} + + string TestName() const { + return ::testing::UnitTest::GetInstance()->current_test_info()->name(); + } + + Client* ClientOrDie(se::Platform* platform, ClientType client_type) { + if (client_type == ClientType::kLocal) { + StatusOr result = + ClientLibrary::GetOrCreateLocalClient(platform); + TF_CHECK_OK(result.status()) + << "could not create LocalClient for testing"; + return result.ValueOrDie(); + } else if (client_type == ClientType::kCompileOnly) { + StatusOr result = + ClientLibrary::GetOrCreateCompileOnlyClient(platform); + TF_CHECK_OK(result.status()) + << "could not create CompileOnlyClient for testing"; + return result.ValueOrDie(); + } + LOG(FATAL) << "invalid client_type value"; + } + + StatusOr ComputeDynamismLiteral(Client* client, XlaOp operand, + XlaBuilder* builder, + Layout* output_layout = nullptr) { + TF_ASSIGN_OR_RETURN(auto subgraph, + builder->BuildDynamicInferenceGraph(operand)); + TF_ASSIGN_OR_RETURN(auto computed, + client->ComputeConstant(subgraph, output_layout)); + return std::move(computed); + } + + StatusOr ComputeDynamismScalar(Client* client, XlaOp operand, + XlaBuilder* builder, + ShapeIndex index = {}) { + TF_ASSIGN_OR_RETURN(auto literal, ComputeDynamismLiteral(client, operand, + builder, nullptr)); + return literal.Get({}, index); + } + + se::Platform* platform_; +}; + +TEST_F(DynamismInferenceTest, ScalarInt32Literal) { + for (ClientType client_type : client_types) { + Client* client = ClientOrDie(platform_, client_type); + XlaBuilder b(TestName()); + auto computation = ConstantR0(&b, 42); + + auto value = ComputeDynamismScalar(client, computation, &b); + ASSERT_TRUE(value.ok()) << value.status(); + // A constant is 
not dynamic. + EXPECT_EQ(value.ValueOrDie(), false); + } +} + +TEST_F(DynamismInferenceTest, TupleGteKeepsDynamism) { + for (ClientType client_type : client_types) { + Client* client = ClientOrDie(platform_, client_type); + XlaBuilder b(TestName()); + auto c = ConstantR0(&b, 42); + auto p = Parameter(&b, 0, ShapeUtil::MakeScalarShape(S32), "0"); + + auto tuple = Tuple(&b, {c, p}); + auto gte0 = GetTupleElement(tuple, 0); + auto gte1 = GetTupleElement(tuple, 1); + auto tuple_2 = Tuple(&b, {gte0, gte1}); + EXPECT_EQ(ComputeDynamismScalar(client, tuple_2, &b, {0}).ValueOrDie(), + false); + EXPECT_EQ(ComputeDynamismScalar(client, tuple_2, &b, {1}).ValueOrDie(), + true); + } +} + +TEST_F(DynamismInferenceTest, ConcatSliceReshapeKeepsDynamism) { + for (ClientType client_type : client_types) { + Client* client = ClientOrDie(platform_, client_type); + XlaBuilder b(TestName()); + auto c = ConstantR0(&b, 42); + auto p = Parameter(&b, 0, ShapeUtil::MakeScalarShape(S32), "0"); + + auto concat = ConcatScalars(&b, {c, p}); + auto slice0 = SliceInDim(concat, 0, 1, 1, 0); + auto reshape0 = Reshape(slice0, {}); + auto slice1 = SliceInDim(concat, 1, 2, 1, 0); + auto reshape1 = Reshape(slice1, {}); + auto tuple_2 = Tuple(&b, {reshape0, reshape1}); + EXPECT_EQ(ComputeDynamismScalar(client, tuple_2, &b, {0}).ValueOrDie(), + false); + EXPECT_EQ(ComputeDynamismScalar(client, tuple_2, &b, {1}).ValueOrDie(), + true); + } +} + +TEST_F(DynamismInferenceTest, ParameterIsDynamic) { + for (ClientType client_type : client_types) { + Client* client = ClientOrDie(platform_, client_type); + XlaBuilder b(TestName()); + auto computation = Parameter(&b, 0, ShapeUtil::MakeScalarShape(S32), "0"); + + auto value = ComputeDynamismScalar(client, computation, &b); + ASSERT_TRUE(value.ok()) << value.status(); + // A parameter is considered dynamic. 
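The dynamism-inference cases here and below pin down a simple propagation rule: constants are static, parameters are dynamic, and most ops derive their dynamism from their operands (element-wise ops OR it together, tuples, slices, and reshapes just forward it). A toy version of that rule, written independently of XlaBuilder::BuildDynamicInferenceGraph, is:

#include <vector>

// Sketch of dynamism propagation; the real pass builds a parallel HLO graph
// of predicates rather than walking a toy node structure like this one.
enum class Kind { kConstant, kParameter, kElementwise };
struct Node {
  Kind kind;
  std::vector<const Node*> operands;
};

bool IsDynamic(const Node& n) {
  switch (n.kind) {
    case Kind::kConstant:
      return false;  // Literal values are known at compile time.
    case Kind::kParameter:
      return true;   // Runtime-fed values are dynamic.
    case Kind::kElementwise: {
      bool dynamic = false;  // OR the dynamism of the operands.
      for (const Node* op : n.operands) dynamic = dynamic || IsDynamic(*op);
      return dynamic;
    }
  }
  return false;
}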
+ EXPECT_EQ(value.ValueOrDie(), true); + } +} + +TEST_F(DynamismInferenceTest, UnaryOpKeepsDynamism) { + for (ClientType client_type : client_types) { + Client* client = ClientOrDie(platform_, client_type); + XlaBuilder b(TestName()); + auto c = ConstantR0(&b, 42); + auto p = Parameter(&b, 0, ShapeUtil::MakeScalarShape(S32), "0"); + + auto neg0 = Neg(c); + auto neg1 = Neg(p); + auto tuple_2 = Tuple(&b, {neg0, neg1}); + EXPECT_EQ(ComputeDynamismScalar(client, tuple_2, &b, {0}).ValueOrDie(), + false); + EXPECT_EQ(ComputeDynamismScalar(client, tuple_2, &b, {1}).ValueOrDie(), + true); + } +} + +TEST_F(DynamismInferenceTest, BinaryOpsOrsDynamism) { + for (ClientType client_type : client_types) { + Client* client = ClientOrDie(platform_, client_type); + XlaBuilder b(TestName()); + auto c = ConstantR0(&b, 42); + auto p = Parameter(&b, 0, ShapeUtil::MakeScalarShape(S32), "0"); + + // Static value + static value = static + auto add1 = Add(c, c); + // Dynamic value + dynamic value = dynamic + auto add2 = Add(p, c); + auto tuple_2 = Tuple(&b, {add1, add2}); + EXPECT_EQ(ComputeDynamismScalar(client, tuple_2, &b, {0}).ValueOrDie(), + false); + EXPECT_EQ(ComputeDynamismScalar(client, tuple_2, &b, {1}).ValueOrDie(), + true); + } +} + +TEST_F(DynamismInferenceTest, GetDimensionSize) { + for (ClientType client_type : client_types) { + Client* client = ClientOrDie(platform_, client_type); + XlaBuilder b(TestName()); + // param = Param([<=2, 3]) + // get_dimension_size(param, 0) is dynamic + // get_dimension_size(param, 1) is static + auto p = + Parameter(&b, 0, ShapeUtil::MakeShape(S32, {2, 3}, {true, false}), "0"); + + auto gds0 = GetDimensionSize(p, 0); + auto gds1 = GetDimensionSize(p, 1); + auto tuple_2 = Tuple(&b, {gds0, gds1}); + EXPECT_EQ(ComputeDynamismScalar(client, tuple_2, &b, {0}).ValueOrDie(), + true); + EXPECT_EQ(ComputeDynamismScalar(client, tuple_2, &b, {1}).ValueOrDie(), + false); + } +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/tests/prng_test.cc b/tensorflow/compiler/xla/tests/prng_test.cc index b83fed07e34..201c0da87f1 100644 --- a/tensorflow/compiler/xla/tests/prng_test.cc +++ b/tensorflow/compiler/xla/tests/prng_test.cc @@ -103,7 +103,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values( // The largest negative number smaller than zero in bf16 that's not // denormalized. - std::make_pair(static_cast(-bfloat16::min_positive_normal()), + std::make_pair(static_cast( + -std::numeric_limits::min()), 0.0f), // Test odd and even values. 
std::make_pair(32.75f, 33.00f), std::make_pair(32.50f, 32.75f), diff --git a/tensorflow/compiler/xla/tools/BUILD b/tensorflow/compiler/xla/tools/BUILD index b113b498e22..fc1ca7d3105 100644 --- a/tensorflow/compiler/xla/tools/BUILD +++ b/tensorflow/compiler/xla/tools/BUILD @@ -308,17 +308,18 @@ cc_library( ":prepare_reference_module", "//tensorflow/compiler/xla:debug_options_flags", "//tensorflow/compiler/xla:error_spec", + "//tensorflow/compiler/xla:literal_comparison", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/client/lib:testing", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_runner", "//tensorflow/compiler/xla/service:hlo_verifier", "//tensorflow/compiler/xla/service:platform_util", - "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:test_utils", + "//tensorflow/core:lib", "//tensorflow/core/platform:logging", + "//tensorflow/core/platform:path", "//tensorflow/core/platform:status", - "//tensorflow/core/platform:test", "//tensorflow/stream_executor:platform", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:span", @@ -339,6 +340,7 @@ tf_cc_binary( "//tensorflow/core:framework_internal", "//tensorflow/core/platform:logging", "//tensorflow/core/platform:platform_port", + "//tensorflow/core/platform:path", "//tensorflow/core/platform:status", "//tensorflow/core/platform:test", ] + if_cuda_or_rocm([ diff --git a/tensorflow/compiler/xla/tools/hlo_module_loader.cc b/tensorflow/compiler/xla/tools/hlo_module_loader.cc index b3aaba7fa25..8b70b0d35a7 100644 --- a/tensorflow/compiler/xla/tools/hlo_module_loader.cc +++ b/tensorflow/compiler/xla/tools/hlo_module_loader.cc @@ -21,7 +21,6 @@ limitations under the License. #include #include -#include "google/protobuf/text_format.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_join.h" #include "absl/strings/str_split.h" @@ -32,6 +31,7 @@ limitations under the License. #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/regexp.h" namespace xla { @@ -87,9 +87,10 @@ StatusOr> LoadModuleFromData( return InvalidArgument("Failed to parse input as HLO protobuf binary"); } } else if (format == "pbtxt") { - if (!google::protobuf::TextFormat::ParseFromString(data, &proto) && - !google::protobuf::TextFormat::ParseFromString(data, proto.mutable_hlo()) && - !google::protobuf::TextFormat::ParseFromString( + if (!tensorflow::protobuf::TextFormat::ParseFromString(data, &proto) && + !tensorflow::protobuf::TextFormat::ParseFromString( + data, proto.mutable_hlo()) && + !tensorflow::protobuf::TextFormat::ParseFromString( data, proto.mutable_hlo()->mutable_hlo_module())) { return InvalidArgument("Failed to parse input as HLO protobuf text"); } diff --git a/tensorflow/compiler/xla/tools/run_hlo_module.cc b/tensorflow/compiler/xla/tools/run_hlo_module.cc index 39b545af393..be9b23efb12 100644 --- a/tensorflow/compiler/xla/tools/run_hlo_module.cc +++ b/tensorflow/compiler/xla/tools/run_hlo_module.cc @@ -27,24 +27,66 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/lib/testing.h" #include "tensorflow/compiler/xla/debug_options_flags.h" #include "tensorflow/compiler/xla/error_spec.h" +#include "tensorflow/compiler/xla/literal_comparison.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_runner.h" #include "tensorflow/compiler/xla/service/hlo_verifier.h" #include "tensorflow/compiler/xla/service/platform_util.h" -#include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_utils.h" #include "tensorflow/compiler/xla/tools/hlo_module_loader.h" #include "tensorflow/compiler/xla/tools/prepare_reference_module.h" #include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/path.h" #include "tensorflow/core/platform/status.h" -#include "tensorflow/core/platform/test.h" - -namespace se = ::stream_executor; namespace xla { namespace { +// Writes the given literal to a file in the test temporary directory. +void WriteLiteralToTempFile(const LiteralSlice& literal, const string& name) { + // Bazel likes for tests to write "debugging outputs" like these to + // TEST_UNDECLARED_OUTPUTS_DIR. This plays well with tools that inspect test + // results, especially when they're run on remote machines. + auto* env = tensorflow::Env::Default(); + string binary_filename; + string text_filename; + string outdir; + if (tensorflow::io::GetTestUndeclaredOutputsDir(&outdir)) { + string filename = tensorflow::io::JoinPath( + outdir, absl::StrFormat("tempfile-%d-%s", env->NowMicros(), name)); + binary_filename = absl::StrCat(filename, ".pb"); + text_filename = absl::StrCat(filename, ".txt"); + } else { + binary_filename = + tensorflow::io::GetTempFilename(absl::StrCat(name, ".pb")); + text_filename = tensorflow::io::GetTempFilename(absl::StrCat(name, ".txt")); + } + + TF_CHECK_OK( + tensorflow::WriteBinaryProto(env, binary_filename, literal.ToProto())); + TF_CHECK_OK( + tensorflow::WriteStringToFile(env, text_filename, literal.ToString())); + LOG(ERROR) << "wrote Literal to " << name << " binary: " << binary_filename + << " text: " << text_filename; +} + +// Callback helper that dumps literals to temporary files in the event of a +// miscomparison. 
+void OnMiscompare(const LiteralSlice& expected, const LiteralSlice& actual, + const LiteralSlice& mismatches, + const ShapeIndex& /*shape_index*/) { + LOG(INFO) << "expected: " << ShapeUtil::HumanString(expected.shape()) << " " + << literal_comparison::ToStringTruncated(expected); + LOG(INFO) << "actual: " << ShapeUtil::HumanString(actual.shape()) << " " + << literal_comparison::ToStringTruncated(actual); + LOG(INFO) << "Dumping literals to temp files..."; + WriteLiteralToTempFile(expected, "expected"); + WriteLiteralToTempFile(actual, "actual"); + WriteLiteralToTempFile(mismatches, "mismatches"); +} + Literal ExecuteOnPlatform(std::unique_ptr module, absl::Span args, se::Platform* platform, bool run_hlo_passes) { @@ -69,7 +111,7 @@ Literal ExecuteOnPlatform(std::unique_ptr module, } } // namespace -::testing::AssertionResult RunAndCompare( +Status RunAndCompare( const std::string& hlo_filename, const std::string& test_platform_name, const std::string& reference_platform_name, std::minstd_rand0* engine, const RunHloModuleOptions& options, @@ -122,7 +164,7 @@ Literal ExecuteOnPlatform(std::unique_ptr module, if (reference_module == nullptr) { std::cerr << "Skipping reference platform\n"; - return ::testing::AssertionSuccess(); + return Status::OK(); } Literal reference_result = @@ -136,10 +178,10 @@ Literal ExecuteOnPlatform(std::unique_ptr module, } ErrorSpec error_spec(static_cast(options.abs_error_bound), static_cast(options.rel_error_bound)); - return LiteralTestUtil::Near(/*expected=*/reference_result, - /*actual=*/test_result, - /*error_spec=*/error_spec, - /*detailed_message=*/true); + return literal_comparison::Near(/*expected=*/reference_result, + /*actual=*/test_result, + /*error=*/error_spec, + /*detailed_message=*/true, &OnMiscompare); } } // namespace xla diff --git a/tensorflow/compiler/xla/tools/run_hlo_module.h b/tensorflow/compiler/xla/tools/run_hlo_module.h index 932cc22f4dd..57f81cc7c94 100644 --- a/tensorflow/compiler/xla/tools/run_hlo_module.h +++ b/tensorflow/compiler/xla/tools/run_hlo_module.h @@ -22,7 +22,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/core/platform/status.h" -#include "tensorflow/core/platform/test.h" #include "tensorflow/stream_executor/platform.h" namespace xla { @@ -63,7 +62,7 @@ struct RunHloModuleOptions { // the results. 'reference_module_modifier_hook' can be used to transform the // HloModule before it is run on the reference platform. This may be necessary // to match the numerics of the test platform. -::testing::AssertionResult RunAndCompare( +Status RunAndCompare( const std::string& hlo_filename, const std::string& test_platform_name, const std::string& reference_platform_name, std::minstd_rand0* engine, const RunHloModuleOptions& options, diff --git a/tensorflow/compiler/xla/tools/run_hlo_module_main.cc b/tensorflow/compiler/xla/tools/run_hlo_module_main.cc index 39d7826e162..9d153491862 100644 --- a/tensorflow/compiler/xla/tools/run_hlo_module_main.cc +++ b/tensorflow/compiler/xla/tools/run_hlo_module_main.cc @@ -156,7 +156,7 @@ int main(int argc, char** argv) { if (iteration_count != 1) { std::cerr << "\n=== Iteration " << i << "\n"; } - ::testing::AssertionResult matched = + xla::Status matched = xla::RunAndCompare(hlo_filename, test_platform_name, reference_platform_name, &engine, opts); @@ -164,13 +164,13 @@ int main(int argc, char** argv) { // used. Without a reference, the test just verifies that nothing blew up // when running the module. 
if (!reference_platform_name.empty()) { - if (matched) { + if (matched.ok()) { // Success. std::cerr << "\n** Results on " << test_platform_name << " and " << reference_platform_name << " are close enough. **\n"; } else { failure_count++; - std::cerr << matched.message() << "\n"; + std::cerr << matched << "\n"; } } } diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto index 42b6ea6bd53..1cf30b10373 100644 --- a/tensorflow/compiler/xla/xla.proto +++ b/tensorflow/compiler/xla/xla.proto @@ -270,8 +270,8 @@ message DebugOptions { // Paths to files with ptx code. repeated string xla_gpu_ptx_file = 127; - // Blacklist for cuDNN convolutions. - string xla_gpu_algorithm_blacklist_path = 128; + // Denylist for cuDNN convolutions. + string xla_gpu_algorithm_denylist_path = 128; // Guarantee run-to-run determinism from reductions on XLA:GPU. bool xla_gpu_deterministic_reductions = 130; @@ -349,6 +349,10 @@ message ExecutionOptions { // Indicates whether to use SPMD (true) or MPMD (false) partitioning when // num_partitions > 1 and XLA is requested to partition the input program. bool use_spmd_partitioning = 11; + + // If set, deduplicate hlo into function calls to reduce binary size. Only + // works on TPU. + bool deduplicate_hlo = 12; } message GetDeviceHandlesRequest { diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto index e8b6105d3fe..d334f879c3e 100644 --- a/tensorflow/compiler/xla/xla_data.proto +++ b/tensorflow/compiler/xla/xla_data.proto @@ -627,6 +627,11 @@ message OpSharding { // applied, this is inferred from the instruction this sharding gets attached // to. repeated OpSharding tuple_shardings = 5; + + // Only used for OTHER type. If true, data is sharded according to other + // dimensions of tile_assignment(), but replicated across devices along the + // last dimension. (Experimental) + bool replicate_on_last_tile_dim = 6; } // Describes the replica groups in a cross replica op (e.g., all-reduce and diff --git a/tensorflow/compiler/xrt/xrt_state.cc b/tensorflow/compiler/xrt/xrt_state.cc index c2f9a1c62c9..c4094795a96 100644 --- a/tensorflow/compiler/xrt/xrt_state.cc +++ b/tensorflow/compiler/xrt/xrt_state.cc @@ -650,7 +650,7 @@ Status XRTTupleAllocation::AliasBufferFrom(const XRTTupleAllocation& source, xla::StatusOr XRTTupleAllocation::ToExecutionInput( const std::function(const xla::ShapeIndex&)>& alias_checker) { - xla::ExecutionInput result(on_device_shape()); + xla::ExecutionInput result(on_device_shape(), on_host_shape()); for (const auto& index_buffer : buffers_) { if (index_buffer.second == nullptr || (index_buffer.second->allocation().is_null() && diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 18341a81df4..12e143e7933 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -458,7 +458,7 @@ tf_cuda_library( "//tensorflow/core/framework:common_shape_fns.h", "//tensorflow/core/framework:control_flow.h", # TODO(josh11b): Make internal? 
"//tensorflow/core/framework:dataset.h", - "//tensorflow/core/framework:dataset_stateful_op_whitelist.h", + "//tensorflow/core/framework:dataset_stateful_op_allowlist.h", "//tensorflow/core/framework:device_base.h", "//tensorflow/core/framework:function.h", "//tensorflow/core/framework:function_handle_cache.h", @@ -629,8 +629,8 @@ tf_gen_op_libs( "io_ops", "linalg_ops", "list_ops", + "map_ops", "lookup_ops", - "logging_ops", "manip_ops", "math_ops", "mkl_nn_ops", @@ -664,6 +664,19 @@ tf_gen_op_libs( ], ) +tf_gen_op_libs( + is_external = False, + op_lib_names = [ + "logging_ops", + ], + deps = [ + ":lib", + ":protos_all_cc", + # TODO(b/162630222): remove this dependency. + "//tensorflow/c/kernels:summary_op_lib", + ], +) + tf_gen_op_libs( op_lib_names = [ "string_ops", @@ -798,36 +811,7 @@ tf_gen_op_libs( "ragged_conversion_ops", "ragged_math_ops", ], - deps = [":ragged_to_dense_util"], -) - -cc_library( - name = "ragged_to_dense_util", - srcs = [ - "ops/ragged_to_dense_util.cc", - ], - hdrs = [ - "ops/ragged_to_dense_util.h", - ], - deps = [ - ":framework", - ":protos_all_cc", - ], -) - -tf_cc_test( - name = "ragged_to_dense_util_test", - srcs = [ - "ops/ragged_to_dense_util_test.cc", - ], - deps = [ - ":framework", - ":protos_all_cc", - ":ragged_to_dense_util", - ":test", - ":testlib", - "@com_google_googletest//:gtest_main", - ], + deps = ["//tensorflow/core/util:ragged_to_dense_util"], ) cc_library( @@ -860,6 +844,7 @@ cc_library( ":io_ops_op_lib", ":linalg_ops_op_lib", ":list_ops_op_lib", + ":map_ops_op_lib", ":logging_ops_op_lib", ":lookup_ops_op_lib", ":manip_ops_op_lib", @@ -892,6 +877,7 @@ cc_library( ":user_ops_op_lib", ":word2vec_ops", "//tensorflow/c/kernels:bitcast_op_lib", + "//tensorflow/c/kernels:summary_op_lib", "//tensorflow/compiler/mlir/tensorflow:mlir_passthrough_op", ] + if_chromiumos( [], @@ -909,6 +895,7 @@ cc_library( ":tpu_outfeed_ops_op_lib", ":tpu_ordinal_selector_ops_op_lib", ":tpu_replication_ops_op_lib", + "//tensorflow/core/tpu/ops", ], ) + if_mkl([ ":mkl_array_ops_op_lib", @@ -998,6 +985,7 @@ cc_library( name = "all_kernels_impl", visibility = [":__subpackages__"], deps = [ + "//tensorflow/c/kernels:summary_op", "//tensorflow/c/kernels:bitcast_op", "//tensorflow/core/kernels:array", "//tensorflow/core/kernels:audio", @@ -1022,9 +1010,7 @@ cc_library( "//tensorflow/core/kernels:functional_ops", "//tensorflow/core/kernels:grappler", "//tensorflow/core/kernels:histogram_op", - "//tensorflow/core/kernels:image", "//tensorflow/core/kernels:io", - "//tensorflow/core/kernels:linalg", "//tensorflow/core/kernels:lookup", "//tensorflow/core/kernels:logging", "//tensorflow/core/kernels:manip", @@ -1058,32 +1044,34 @@ cc_library( "//tensorflow/core/kernels:summary_kernels", "//tensorflow/core/kernels:training_ops", "//tensorflow/core/kernels:word2vec_kernels", + "//tensorflow/core/kernels/linalg:linalg", + "//tensorflow/core/kernels/image:image", "//tensorflow/core/kernels/sparse:kernels", ] + if_not_windows([ "//tensorflow/core/kernels/neon:neon_depthwise_conv_op", ]) + if_mkl([ - "//tensorflow/core/kernels:mkl_aggregate_ops", - "//tensorflow/core/kernels:mkl_concat_op", - "//tensorflow/core/kernels:mkl_dequantize_op", - "//tensorflow/core/kernels:mkl_conv_op", - "//tensorflow/core/kernels:mkl_cwise_ops_common", - "//tensorflow/core/kernels:mkl_fused_batch_norm_op", - "//tensorflow/core/kernels:mkl_identity_op", - "//tensorflow/core/kernels:mkl_input_conversion_op", - "//tensorflow/core/kernels:mkl_lrn_op", - "//tensorflow/core/kernels:mkl_pooling_ops", - 
"//tensorflow/core/kernels:mkl_qmatmul_op", - "//tensorflow/core/kernels:mkl_requantize_ops", - "//tensorflow/core/kernels:mkl_quantize_op", - "//tensorflow/core/kernels:mkl_relu_op", - "//tensorflow/core/kernels:mkl_reshape_op", - "//tensorflow/core/kernels:mkl_slice_op", - "//tensorflow/core/kernels:mkl_softmax_op", - "//tensorflow/core/kernels:mkl_transpose_op", - "//tensorflow/core/kernels:mkl_batch_matmul_op", - "//tensorflow/core/kernels:mkl_matmul_op", - "//tensorflow/core/kernels:mkl_tfconv_op", - "//tensorflow/core/kernels:mkl_tmp_bf16_ops", + "//tensorflow/core/kernels/mkl:mkl_aggregate_ops", + "//tensorflow/core/kernels/mkl:mkl_concat_op", + "//tensorflow/core/kernels/mkl:mkl_dequantize_op", + "//tensorflow/core/kernels/mkl:mkl_conv_op", + "//tensorflow/core/kernels/mkl:mkl_cwise_ops_common", + "//tensorflow/core/kernels/mkl:mkl_fused_batch_norm_op", + "//tensorflow/core/kernels/mkl:mkl_identity_op", + "//tensorflow/core/kernels/mkl:mkl_input_conversion_op", + "//tensorflow/core/kernels/mkl:mkl_lrn_op", + "//tensorflow/core/kernels/mkl:mkl_pooling_ops", + "//tensorflow/core/kernels/mkl:mkl_qmatmul_op", + "//tensorflow/core/kernels/mkl:mkl_requantize_ops", + "//tensorflow/core/kernels/mkl:mkl_quantize_op", + "//tensorflow/core/kernels/mkl:mkl_relu_op", + "//tensorflow/core/kernels/mkl:mkl_reshape_op", + "//tensorflow/core/kernels/mkl:mkl_slice_op", + "//tensorflow/core/kernels/mkl:mkl_softmax_op", + "//tensorflow/core/kernels/mkl:mkl_transpose_op", + "//tensorflow/core/kernels/mkl:mkl_batch_matmul_op", + "//tensorflow/core/kernels/mkl:mkl_matmul_op", + "//tensorflow/core/kernels/mkl:mkl_tfconv_op", + "//tensorflow/core/kernels/mkl:mkl_tmp_bf16_ops", ]) + if_cuda_or_rocm([ "//tensorflow/core/kernels:cudnn_rnn_kernels", ]) + if_cuda([ @@ -1121,6 +1109,8 @@ cc_library( # these also dynamically loading. 
"//tensorflow/core/kernels:dataset_ops", # Depends on grappler "//tensorflow/core/kernels:list_kernels", # Depends on variant_op_registry.h + "//tensorflow/core/kernels:map_kernels", + "//tensorflow/core/kernels:tensor_map", ], ) @@ -1927,6 +1917,7 @@ cc_library( "//tensorflow/core/platform:platform_port", "//tensorflow/core/platform:platform_strings", "//tensorflow/core/platform:prefetch", + "//tensorflow/core/platform:profile_utils_cpu_utils", "//tensorflow/core/platform:protobuf_internal", "//tensorflow/core/platform:regexp", "//tensorflow/core/platform:resource", @@ -1986,6 +1977,7 @@ cc_library( ":lib", ":lib_internal", "//tensorflow/core/platform:gif", + "@com_google_absl//absl/strings", ], ) @@ -2093,13 +2085,9 @@ cc_library( copts = tf_copts(), linkopts = ["-ldl"], deps = [ - "//tensorflow/core/lib/strings:numbers", - "//tensorflow/core/lib/strings:strcat", "//tensorflow/core/platform:dynamic_annotations", "//tensorflow/core/platform:gif", "//tensorflow/core/platform:logging", - "//tensorflow/core/platform:numbers", - "//tensorflow/core/platform:strcat", "//tensorflow/core/platform:stringpiece", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/strings", @@ -2706,27 +2694,27 @@ tf_cc_test_mkl( "//tensorflow/core/kernels:ops_util", "//third_party/eigen3", ] + if_mkl([ - "//tensorflow/core/kernels:mkl_aggregate_ops", - "//tensorflow/core/kernels:mkl_batch_matmul_op", - "//tensorflow/core/kernels:mkl_concat_op", - "//tensorflow/core/kernels:mkl_conv_op", - "//tensorflow/core/kernels:mkl_cwise_ops_common", - "//tensorflow/core/kernels:mkl_dequantize_op", - "//tensorflow/core/kernels:mkl_fused_batch_norm_op", - "//tensorflow/core/kernels:mkl_identity_op", - "//tensorflow/core/kernels:mkl_input_conversion_op", - "//tensorflow/core/kernels:mkl_lrn_op", - "//tensorflow/core/kernels:mkl_matmul_op", - "//tensorflow/core/kernels:mkl_pooling_ops", - "//tensorflow/core/kernels:mkl_qmatmul_op", - "//tensorflow/core/kernels:mkl_quantize_op", - "//tensorflow/core/kernels:mkl_relu_op", - "//tensorflow/core/kernels:mkl_reshape_op", - "//tensorflow/core/kernels:mkl_slice_op", - "//tensorflow/core/kernels:mkl_softmax_op", - "//tensorflow/core/kernels:mkl_tfconv_op", - "//tensorflow/core/kernels:mkl_transpose_op", - "//tensorflow/core/kernels:mkl_tmp_bf16_ops", + "//tensorflow/core/kernels/mkl:mkl_aggregate_ops", + "//tensorflow/core/kernels/mkl:mkl_batch_matmul_op", + "//tensorflow/core/kernels/mkl:mkl_concat_op", + "//tensorflow/core/kernels/mkl:mkl_conv_op", + "//tensorflow/core/kernels/mkl:mkl_cwise_ops_common", + "//tensorflow/core/kernels/mkl:mkl_dequantize_op", + "//tensorflow/core/kernels/mkl:mkl_fused_batch_norm_op", + "//tensorflow/core/kernels/mkl:mkl_identity_op", + "//tensorflow/core/kernels/mkl:mkl_input_conversion_op", + "//tensorflow/core/kernels/mkl:mkl_lrn_op", + "//tensorflow/core/kernels/mkl:mkl_matmul_op", + "//tensorflow/core/kernels/mkl:mkl_pooling_ops", + "//tensorflow/core/kernels/mkl:mkl_qmatmul_op", + "//tensorflow/core/kernels/mkl:mkl_quantize_op", + "//tensorflow/core/kernels/mkl:mkl_relu_op", + "//tensorflow/core/kernels/mkl:mkl_reshape_op", + "//tensorflow/core/kernels/mkl:mkl_slice_op", + "//tensorflow/core/kernels/mkl:mkl_softmax_op", + "//tensorflow/core/kernels/mkl:mkl_tfconv_op", + "//tensorflow/core/kernels/mkl:mkl_transpose_op", + "//tensorflow/core/kernels/mkl:mkl_tmp_bf16_ops", ]), ) diff --git a/tensorflow/core/api_def/base_api/api_def_AvgPool3D.pbtxt b/tensorflow/core/api_def/base_api/api_def_AvgPool3D.pbtxt index 8171566a212..fcaa93acac1 
--- a/tensorflow/core/api_def/base_api/api_def_AvgPool3D.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_AvgPool3D.pbtxt
@@ -43,4 +43,8 @@ Alternatively, the format could be "NCDHW", the data storage order is:
 END
   }
   summary: "Performs 3D average pooling on the input."
+  description: <